Example #1
class FindProbableCauseOfFailureTestCase(MockHadoopTestCase):

    # integration tests for _find_probable_cause_of_failure()

    def setUp(self):
        super(FindProbableCauseOfFailureTestCase, self).setUp()

        os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'

        self.runner = HadoopJobRunner()

    def test_empty(self):
        self.assertEqual(self.runner._find_probable_cause_of_failure(), None)

    def test_yarn_python_exception(self):
        APPLICATION_ID = 'application_1450486922681_0004'
        CONTAINER_ID = 'container_1450486922681_0005_01_000003'

        log_subdir = os.path.join(
            os.environ['HADOOP_HOME'], 'logs',
            'userlogs', APPLICATION_ID, CONTAINER_ID)

        os.makedirs(log_subdir)

        syslog_path = os.path.join(log_subdir, 'syslog')
        with open(syslog_path, 'w') as syslog:
            syslog.write(
                '2015-12-21 14:06:17,707 INFO [main]'
                ' org.apache.hadoop.mapred.MapTask: Processing split:'
                ' hdfs://e4270474c8ee:9000/user/root/tmp/mrjob'
                '/mr_boom.root.20151221.190511.059097/files'
                '/bootstrap.sh:0+335\n')
            syslog.write(
                '2015-12-21 14:06:18,538 WARN [main]'
                ' org.apache.hadoop.mapred.YarnChild: Exception running child'
                ' : java.lang.RuntimeException:'
                ' PipeMapRed.waitOutputThreads(): subprocess failed with'
                ' code 1\n')
            syslog.write(
                '        at org.apache.hadoop.streaming.PipeMapRed'
                '.waitOutputThreads(PipeMapRed.java:322)\n')

        stderr_path = os.path.join(log_subdir, 'stderr')
        with open(stderr_path, 'w') as stderr:
            stderr.write('Traceback (most recent call last):\n')
            stderr.write('  File "mr_boom.py", line 10, in <module>\n')
            stderr.write('    MRBoom.run()\n')
            stderr.write('Exception: BOOM\n')

        # need application_id
        self.assertIsNone(self.runner._find_probable_cause_of_failure())

        cause = self.runner._find_probable_cause_of_failure(
            application_id=APPLICATION_ID)

        self.assertTrue(cause)
        self.assertEqual(cause['syslog']['path'], syslog_path)
        self.assertTrue(cause['syslog']['error'])
        self.assertEqual(cause['stderr']['path'], stderr_path)
        self.assertTrue(cause['stderr']['error'])
Example #2
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=StringIO())
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_args_for_step',
                          return_value=['hadoop_args_for_step'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'new_upload_args', 'hadoop_args_for_step',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'hadoop_args_for_step',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir',
             'old_upload_args']
Example #3
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(hadoop_bin='hadoop',
                                      hadoop_streaming_jar='<streaming jar>',
                                      mr_job_script='my_job.py',
                                      stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(
            patch.object(self.runner,
                         '_upload_args',
                         return_value=['<upload args>']))
        self.start(
            patch.object(self.runner,
                         '_hadoop_args_for_step',
                         return_value=['<hadoop args for step>']))
        self.start(
            patch.object(self.runner,
                         '_hdfs_step_input_files',
                         return_value=['<hdfs step input files>']))
        self.start(
            patch.object(self.runner,
                         '_hdfs_step_output_dir',
                         return_value='<hdfs step output dir>'))
        self.start(
            patch.object(HadoopFilesystem,
                         'get_hadoop_version',
                         return_value='2.7.1'))
        self.runner._script_path = 'my_job.py'
Example #4
    def test_du(self):
        root = os.environ['MOCK_HDFS_ROOT']
        data_path_1 = os.path.join(root, 'data1')
        with open(data_path_1, 'w') as f:
            f.write("abcd")
        remote_data_1 = 'hdfs:///data1'

        data_dir = os.path.join(root, 'more')
        os.mkdir(data_dir)
        remote_dir = 'hdfs:///more'

        data_path_2 = os.path.join(data_dir, 'data2')
        with open(data_path_2, 'w') as f:
            f.write("defg")
        remote_data_2 = 'hdfs:///more/data2'

        data_path_3 = os.path.join(data_dir, 'data3')
        with open(data_path_3, 'w') as f:
            f.write("hijk")
        remote_data_3 = 'hdfs:///more/data3'

        runner = HadoopJobRunner(conf_path=False)
        self.assertEqual(runner.du(root), 12)
        self.assertEqual(runner.du(remote_dir), 8)
        self.assertEqual(runner.du(remote_dir + '/*'), 8)
        self.assertEqual(runner.du(remote_data_1), 4)
        self.assertEqual(runner.du(remote_data_2), 4)
        self.assertEqual(runner.du(remote_data_3), 4)
Example #5
    def test_infer_from_hadoop_bin_realpath(self):
        with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
            self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
            self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

            self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                             '/ha/do/op/hadoop-streaming.jar')
Example #6
    def setUp(self):
        super(HadoopLogDirsTestCase, self).setUp()

        os.environ.clear()

        self.mock_hadoop_version = '2.7.0'
        # the result of _hadoop_dir(). This handles non-log-specific
        # environment variables, such as $HADOOP_PREFIX, and also guesses
        # based on the path of the Hadoop binary
        self.mock_hadoop_dirs = []

        def mock_get_hadoop_version():
            return self.mock_hadoop_version

        def mock_hadoop_dirs_method():
            return (d for d in self.mock_hadoop_dirs)

        self.start(
            patch('mrjob.hadoop.HadoopJobRunner.get_hadoop_version',
                  side_effect=mock_get_hadoop_version))
        self.start(
            patch('mrjob.hadoop.HadoopJobRunner._hadoop_dirs',
                  side_effect=mock_hadoop_dirs_method))

        self.runner = HadoopJobRunner()
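With get_hadoop_version() and _hadoop_dirs() mocked out as above, a companion test might exercise log-dir discovery along these lines (a hypothetical sketch, not a test from the suite; it assumes _hadoop_log_dirs() considers $HADOOP_LOG_DIR, as the hadoop_log_dirs tests elsewhere in this collection suggest):

    def test_hadoop_log_dir_env_var(self):
        # hypothetical: $HADOOP_LOG_DIR should show up among the
        # directories searched for logs
        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

        self.assertIn('/path/to/hadoop-log-dir',
                      list(self.runner._hadoop_log_dirs()))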
Example #7
    def test_infer_from_hadoop_bin_parent_dir(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/bin-parent/hadoop-streaming.jar')
Example #8
    def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

        self.test_hadoop_mapred_home()
Example #10
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['new_upload_args']))
        self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                                return_value=['old_upload_args']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['hadoop_args_for_step']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['hdfs_step_input_files']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='hdfs_step_output_dir'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.2.0'))
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'new_upload_args', 'hadoop_args_for_step',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'hadoop_args_for_step',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir',
             'old_upload_args']
Example #11
    def test_hadoop_runner_cluster_mode(self):
        runner = HadoopJobRunner(spark_deploy_mode='cluster')

        self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                         ('step', 'history', 'task'))
        self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                         ('step', 'task'))
Example #12
    def test_hadoop_runner_client_mode(self):
        runner = HadoopJobRunner()

        self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                         ('step', 'history', 'task'))
        self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                         ('step',))
Example #13
    def test_hadoop_log_dirs_opt(self):
        self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

        # setting hadoop_log_dirs short-circuits automatic discovery of logs
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['/logs1', '/logs2'])
Example #14
    def test_uris(self):
        runner = HadoopJobRunner()
        list(runner.ls('hdfs://tmp/waffles'))
        list(runner.ls('lego://my/ego'))
        list(runner.ls('/tmp'))

        with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
            hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        assert_equal(hadoop_cmd_args, [
            ['fs', '-lsr', 'hdfs://tmp/waffles'],
            ['fs', '-lsr', 'lego://my/ego'],
        ])
Example #16
    def setUp(self):
        super(HadoopStreamingJarTestCase, self).setUp()

        self.mock_paths = []

        def mock_ls(path):  # don't bother to support globs
            return (p for p in sorted(self.mock_paths) if p.startswith(path))

        self.start(patch('mrjob.fs.local.LocalFilesystem.ls',
                         side_effect=mock_ls))

        os.environ.clear()

        self.runner = HadoopJobRunner()
Example #18
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar')
        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_conf_args',
                          return_value=['hadoop_conf_args'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script = {'name': 'my_job.py'}

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'new_upload_args', 'hadoop_conf_args',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'hadoop_conf_args',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir',
             'old_upload_args']
Example #20
    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        if self.options.runner == 'emr':
            # avoid requiring dependencies (such as boto3) for other runners
            from mrjob.emr import EMRJobRunner
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'dataproc':
            from mrjob.dataproc import DataprocJobRunner
            return DataprocJobRunner(**self.dataproc_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            from mrjob.hadoop import HadoopJobRunner
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            from mrjob.local import LocalMRJobRunner
            return LocalMRJobRunner(**self.local_job_runner_kwargs())
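For context, a launcher's make_runner() is normally driven through a context manager, roughly as below (a minimal sketch; MRWordCount and input.txt are illustrative placeholders, not names from the snippet above):

from mr_word_count import MRWordCount

job = MRWordCount(args=['-r', 'hadoop', 'input.txt'])
with job.make_runner() as runner:
    # runs the job on the runner selected via -r (Hadoop here)
    runner.run()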
Example #24
    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        # have to import here so that we can still run the MRJob
        # without importing boto
        from mrjob.emr import EMRJobRunner
        from mrjob.hadoop import HadoopJobRunner
        from mrjob.local import LocalMRJobRunner

        if self.options.runner == 'emr':
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            return LocalMRJobRunner(**self.local_job_runner_kwargs())
Example #25
    def test_hadoop_log_dirs_opt(self):
        self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

        # setting hadoop_log_dirs short-circuits automatic discovery of logs
        self.assertEqual(
            list(self.runner._hadoop_log_dirs()),
            ['/logs1', '/logs2'])
Example #26
    def setUp(self):
        super(StreamingLogDirsTestCase, self).setUp()

        self.log = self.start(patch('mrjob.hadoop.log'))

        self.runner = HadoopJobRunner()
        self.runner._hadoop_log_dirs = Mock(return_value=[])
        self.runner.fs.exists = Mock(return_value=True)

        self.log.reset_mock()  # ignore logging from HadoopJobRunner init
Example #27
    def test_hadoop_home_regression(self):
        # kill $HADOOP_HOME if it exists
        try:
            del os.environ['HADOOP_HOME']
        except KeyError:
            pass

        with patch('mrjob.hadoop.find_hadoop_streaming_jar',
                   return_value='some.jar'):
            HadoopJobRunner(hadoop_home=self.tmp_dir, conf_paths=[])
Example #29
    def test_cat_compressed(self):
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'w')
        input_gz.write('foo\nbar\n')
        input_gz.close()

        with HadoopJobRunner(cleanup=['NONE']) as runner:
            output = []
            for line in runner.cat(input_gz_path):
                output.append(line)

        assert_equal(output, ['foo\n', 'bar\n'])

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
        input_bz2.write('bar\nbar\nfoo\n')
        input_bz2.close()

        with HadoopJobRunner(cleanup=['NONE']) as runner:
            output = []
            for line in runner.cat(input_bz2_path):
                output.append(line)

        assert_equal(output, ['bar\n', 'bar\n', 'foo\n'])
Example #30
    def test_pass_through_fields(self):
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(hadoop_bin='hadoooooooooop',
                                 hadoop_home='kansas',
                                 hadoop_streaming_jar='streaming.jar')

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner._hadoop_bin, runner.fs._hadoop_bin)

            # deprecation warning is different for non-functions
            self.assertIn(
                'deprecated: access HadoopJobRunner.fs._hadoop_bin directly',
                stderr.getvalue())
Example #31
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['<upload args>']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['<hadoop args for step>']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['<hdfs step input files>']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='<hdfs step output dir>'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='2.7.1'))
        self.runner._script_path = 'my_job.py'
Example #32
    def test_prefer_own_methods(self):
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(hadoop_bin='hadoop',
                                 hadoop_home='kansas',
                                 hadoop_streaming_jar='streaming.jar')

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner.ls, runner.fs.ls)

            # Hadoop Runner has its own version
            self.assertNotEqual(runner.get_hadoop_version,
                                runner.fs.get_hadoop_version)

            self.assertIn('deprecated: call HadoopJobRunner.fs.ls() directly',
                          stderr.getvalue())
            self.assertNotIn('get_hadoop_version', stderr.getvalue())
Example #33
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin="hadoop", hadoop_streaming_jar="streaming.jar", mr_job_script="my_job.py", stdin=StringIO()
        )
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = "0.20.204"
        self.simple_patch(self.runner, "_new_upload_args", return_value=["new_upload_args"])
        self.simple_patch(self.runner, "_old_upload_args", return_value=["old_upload_args"])
        self.simple_patch(self.runner, "_hadoop_args_for_step", return_value=["hadoop_args_for_step"])
        self.simple_patch(self.runner, "_hdfs_step_input_files", return_value=["hdfs_step_input_files"])
        self.simple_patch(self.runner, "_hdfs_step_output_dir", return_value="hdfs_step_output_dir")
        self.runner._script_path = "my_job.py"

        self._new_basic_args = [
            "hadoop",
            "jar",
            "streaming.jar",
            "new_upload_args",
            "hadoop_args_for_step",
            "-input",
            "hdfs_step_input_files",
            "-output",
            "hdfs_step_output_dir",
        ]

        self._old_basic_args = [
            "hadoop",
            "jar",
            "streaming.jar",
            "hadoop_args_for_step",
            "-input",
            "hdfs_step_input_files",
            "-output",
            "hdfs_step_output_dir",
            "old_upload_args",
        ]
Example #34
    def setUp(self):
        super(HadoopLogDirsTestCase, self).setUp()

        os.environ.clear()

        self.mock_hadoop_version = '2.7.0'
        # the result of _hadoop_dir(). This handles non-log-specific
        # environment variables, such as $HADOOP_PREFIX, and also guesses
        # based on the path of the Hadoop binary
        self.mock_hadoop_dirs = []

        def mock_get_hadoop_version():
            return self.mock_hadoop_version

        def mock_hadoop_dirs_method():
            return (d for d in self.mock_hadoop_dirs)

        self.start(patch('mrjob.hadoop.HadoopJobRunner.get_hadoop_version',
                         side_effect=mock_get_hadoop_version))
        self.start(patch('mrjob.hadoop.HadoopJobRunner._hadoop_dirs',
                         side_effect=mock_hadoop_dirs_method))

        self.runner = HadoopJobRunner()
Example #35
    def test_cat_uncompressed(self):
        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'w') as input_file:
            input_file.write('bar\nfoo\n')

        input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
        with open(input_to_upload, 'w') as input_to_upload_file:
            input_to_upload_file.write('foo\nfoo\n')
        remote_input_path = 'hdfs:///data/foo'
        check_call([
            self.hadoop_bin, 'fs', '-put', input_to_upload, remote_input_path
        ])

        with HadoopJobRunner(cleanup=['NONE']) as runner:
            local_output = []
            for line in runner.cat(local_input_path):
                local_output.append(line)

            remote_output = []
            for line in runner.cat(remote_input_path):
                remote_output.append(line)

        assert_equal(local_output, ['bar\n', 'foo\n'])
        assert_equal(remote_output, ['foo\n', 'foo\n'])
Example #36
    def setUp(self):
        super(FindProbableCauseOfFailureTestCase, self).setUp()

        os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'

        self.runner = HadoopJobRunner()
Example #37
    def test_deprecated_hadoop_home_option(self):
        self.runner = HadoopJobRunner(hadoop_home='/ha/do/op/home-option')

        self.mock_paths.append('/ha/do/op/home-option/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home-option/hadoop-streaming.jar')
Example #39
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {
        'runners': {
            'hadoop': {
                'hadoop_home': 'kansas',
                'hadoop_streaming_jar': 'binks.jar.jar',
            }
        }
    }

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(hadoop_bin='hadoop',
                                      hadoop_streaming_jar='streaming.jar',
                                      mr_job_script='my_job.py',
                                      stdin=StringIO())
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner,
                          '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner,
                          '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner,
                          '_hadoop_conf_args',
                          return_value=['hadoop_conf_args'])
        self.simple_patch(self.runner,
                          '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner,
                          '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar', 'new_upload_args',
            'hadoop_conf_args', '-input', 'hdfs_step_input_files', '-output',
            'hdfs_step_output_dir'
        ]

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar', 'hadoop_conf_args', '-input',
            'hdfs_step_input_files', '-output', 'hdfs_step_output_dir',
            'old_upload_args'
        ]

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj,
                               attr,
                               side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args, step_num=0, num_steps=1):
        self.assertEqual(
            self.runner._streaming_args(step, step_num, num_steps),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args, step_num=0, num_steps=1):
        self.runner._hadoop_version = '0.18'
        self.assertEqual(
            self._old_basic_args + args,
            self.runner._streaming_args(step, step_num, num_steps))

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            }, [
                '-mapper', 'python my_job.py --step-num=0 --mapper',
                '-jobconf', 'mapred.reduce.tasks=0'
            ])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            }, [
                '-mapper', 'cat', '-reducer',
                'python my_job.py --step-num=0 --reducer'
            ])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            }, [
                "-mapper",
                "bash -c 'grep anything | python my_job.py --step-num=0"
                " --mapper'", "-combiner",
                "bash -c 'grep nothing | python my_job.py --step-num=0"
                " --combiner'", "-reducer",
                "bash -c 'grep something | python my_job.py --step-num=0"
                " --reducer'"
            ])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            }, [
                "-mapper",
                "bash -c 'cat | sort | python my_job.py --step-num=0"
                " --combiner'", '-jobconf', 'mapred.reduce.tasks=0'
            ])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            }, [
                '-mapper',
                "bash -c 'grep anything | python my_job.py --step-num=0"
                " --mapper | sort | grep nothing | python my_job.py"
                " --step-num=0 --combiner'", '-reducer',
                "bash -c 'grep something | python my_job.py --step-num=0"
                " --reducer'"
            ])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            }, [
                '-mapper', "bash -c 'bash -c '\\''grep"
                " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
                " python my_job.py --step-num=0 --mapper'", '-jobconf',
                'mapred.reduce.tasks=0'
            ])
Example #40
class HadoopStreamingJarTestCase(SandboxedTestCase):

    def setUp(self):
        super(HadoopStreamingJarTestCase, self).setUp()

        self.mock_paths = []

        def mock_ls(path):  # don't bother to support globs
            return (p for p in sorted(self.mock_paths) if p.startswith(path))

        self.start(patch('mrjob.fs.local.LocalFilesystem.ls',
                         side_effect=mock_ls))

        os.environ.clear()

        self.runner = HadoopJobRunner()

    def test_empty_fs(self):
        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_deprecated_hadoop_home_option(self):
        self.runner = HadoopJobRunner(hadoop_home='/ha/do/op/home-option')

        self.mock_paths.append('/ha/do/op/home-option/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home-option/hadoop-streaming.jar')

    def test_deprecated_hadoop_home_option_beats_hadoop_prefix(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/prefix'
        self.mock_paths.append('/ha/do/op/prefix/hadoop-streaming.jar')

        self.test_deprecated_hadoop_home_option()

    # tests of well-known environment variables

    def test_hadoop_prefix(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/prefix'
        self.mock_paths.append('/ha/do/op/prefix/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/prefix/hadoop-streaming.jar')

    def test_hadoop_prefix_beats_hadoop_home(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/home'
        self.mock_paths.append('/ha/do/op/home/hadoop-streaming.jar')

        self.test_hadoop_prefix()

    def test_hadoop_home(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/home'
        self.mock_paths.append('/ha/do/op/home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home/hadoop-streaming.jar')

    def test_hadoop_home_beats_hadoop_install(self):
        os.environ['HADOOP_INSTALL'] = '/ha/do/op/install'
        self.mock_paths.append('/ha/do/op/install/hadoop-streaming.jar')

        self.test_hadoop_home()

    def test_hadoop_install(self):
        os.environ['HADOOP_INSTALL'] = '/ha/do/op/install'
        self.mock_paths.append('/ha/do/op/install/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/install/hadoop-streaming.jar')

    def test_hadoop_install_beats_hadoop_mapred_home(self):
        os.environ['HADOOP_MAPRED_HOME'] = '/ha/do/op/mapred-home'
        self.mock_paths.append('/ha/do/op/mapred-home/hadoop-streaming.jar')

        self.test_hadoop_install()

    def test_hadoop_mapred_home(self):
        os.environ['HADOOP_MAPRED_HOME'] = '/ha/do/op/mapred-home'
        self.mock_paths.append('/ha/do/op/mapred-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/mapred-home/hadoop-streaming.jar')

    def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

        self.test_hadoop_mapred_home()

    # infer from hadoop_bin

    def test_infer_from_hadoop_bin_parent_dir(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/bin-parent/hadoop-streaming.jar')

    def test_hadoop_bin_beats_hadoop_anything_home(self):
        os.environ['HADOOP_ANYTHING_HOME'] = '/ha/do/op/anything-home'
        self.mock_paths.append('/ha/do/op/anything-home/hadoop-streaming.jar')

        self.test_infer_from_hadoop_bin_parent_dir()

    def test_dont_infer_from_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/bin/hadoop'])
        self.mock_paths.append('/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_dont_infer_from_usr_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
        self.mock_paths.append('/usr/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_dont_infer_from_usr_local_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/local/bin/hadoop'])
        self.mock_paths.append('/usr/local/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_infer_from_hadoop_bin_realpath(self):
        with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
            self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
            self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

            self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                             '/ha/do/op/hadoop-streaming.jar')

    # tests of fallback environment variables ($HADOOP_*_HOME)

    def test_hadoop_anything_home(self):
        os.environ['HADOOP_WHATEVER_HOME'] = '/ha/do/op/whatever-home'
        self.mock_paths.append('/ha/do/op/whatever-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/whatever-home/hadoop-streaming.jar')

        # $HADOOP_ANYTHING_HOME comes before $HADOOP_WHATEVER_HOME
        os.environ['HADOOP_ANYTHING_HOME'] = '/ha/do/op/anything-home'
        self.mock_paths.append('/ha/do/op/anything-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/anything-home/hadoop-streaming.jar')

    def test_hadoop_anything_home_beats_hard_coded_paths(self):
        self.mock_paths.append('/home/hadoop/contrib/hadoop-streaming.jar')
        self.mock_paths.append(
            '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')

        self.test_hadoop_anything_home()

    # hard-coded paths (for Hadoop inside EMR)

    def test_hard_coded_emr_paths(self):
        self.mock_paths.append(
            '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')

        # /home/hadoop/contrib takes precedence
        self.mock_paths.append('/home/hadoop/contrib/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/home/hadoop/contrib/hadoop-streaming.jar')

    # invalid environment variables

    def test_other_environment_variable(self):
        os.environ['HADOOP_YARN_MRJOB_DIR'] = '/ha/do/op/yarn-mrjob-dir'
        self.mock_paths.append(
            '/ha/do/op/yarn-mrjob-dir/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    # alternate jar names and paths

    def test_subdirs(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.mock_paths.append('/ha/do/op/contrib/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/contrib/hadoop-streaming.jar')

    def test_hadoop_streaming_jar_name_with_version(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-2.6.0-amzn-0.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-2.6.0-amzn-0.jar')

    def test_skip_hadoop_streaming_source_jar(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        # Googled it; it really is named *-sources.jar, not *-source.jar
        self.mock_paths.append(
            '/ha/do/op/hadoop-streaming-2.0.0-mr1-cdh4.3.1-sources.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    # multiple matching jars in same directory

    def test_pick_shortest_name(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-1.0.3.jar')
        self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

        # hadoop-streaming-1.0.3.jar comes first in alphabetical order
        self.assertEqual(sorted(self.mock_paths), self.mock_paths)

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming.jar')

    def test_pick_shallowest_subpath(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-1.0.3.jar')
        self.mock_paths.append('/ha/do/op/old/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-1.0.3.jar')

    def test_fall_back_to_alphabetical_order(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-a.jar')
        self.mock_paths.append('/ha/do/op/hadoop-streaming-b.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-a.jar')

    # sanity-check that directory order overrides path sort order

    def test_directory_order_overrides_path_sort_order(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/a'
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/b'

        self.mock_paths.append('/ha/do/op/a/hadoop-streaming-a.jar')
        self.mock_paths.append('/ha/do/op/b/hadoop-streaming-b.jar')

        # $HADOOP_PREFIX takes precedence over $HADOOP_HOME, so sort
        # order doesn't matter
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/b/hadoop-streaming-b.jar')

        # now search in parent dir (/ha/do/op) to invoke sort order
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/a/hadoop-streaming-a.jar')
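Taken together, the tests above pin down a search strategy that could be sketched roughly as follows (an illustrative sketch only, not mrjob's actual _find_hadoop_streaming_jar(); dirs and ls are stand-ins for the runner's candidate directories and filesystem listing):

import posixpath

def find_streaming_jar(dirs, ls):
    # dirs: candidate directories in priority order
    # ls: a function yielding paths recursively under a directory,
    #     like the mock_ls patched in above
    for dir_path in dirs:
        candidates = []
        for path in ls(dir_path):
            name = posixpath.basename(path)
            if (name.startswith('hadoop-streaming') and
                    name.endswith('.jar') and
                    not name.endswith('-sources.jar')):
                candidates.append(path)
        if candidates:
            # prefer the shallowest subpath, then the shortest jar
            # name, then alphabetical order
            return min(candidates,
                       key=lambda p: (p.count('/'),
                                      len(posixpath.basename(p)),
                                      p))
    return None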
Example #41
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['new_upload_args']))
        self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                                return_value=['old_upload_args']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['hadoop_args_for_step']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['hdfs_step_input_files']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='hdfs_step_output_dir'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.2.0'))
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'new_upload_args', 'hadoop_args_for_step',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
             'hadoop_args_for_step',
             '-input', 'hdfs_step_input_files',
             '-output', 'hdfs_step_output_dir',
             'old_upload_args']

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        HadoopFilesystem.get_hadoop_version.return_value = '0.18'
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper',
             PYTHON_BIN + ' my_job.py --step-num=0 --mapper',
             '-jobconf',
             'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper',
             'cat',
             '-reducer',
             PYTHON_BIN + ' my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ["-mapper",
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             "-combiner",
             "bash -c 'grep nothing | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             "-reducer",
             "bash -c 'grep something | " + PYTHON_BIN +
             " my_job.py --step-num=0 --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ["-mapper",
             "bash -c 'cat | sort | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0"
             " --mapper | sort | grep nothing | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | " + PYTHON_BIN +
             " my_job.py --step-num=0 --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
             PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])
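The nested quoting in test_pre_filter_escaping comes from wrapping an already-bash-wrapped command in another bash -c. For reference, bash_wrap() amounts to roughly the following (a sketch; the real helper lives in mrjob.util and may differ between versions):

import pipes  # shlex.quote() is the Python 3 spelling

def bash_wrap(cmd_str):
    # run a shell pipeline as a single, fully-quoted bash -c argument
    return "bash -c %s" % pipes.quote(cmd_str)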
Example #42
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {"runners": {"hadoop": {"hadoop_home": "kansas", "hadoop_streaming_jar": "binks.jar.jar"}}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin="hadoop", hadoop_streaming_jar="streaming.jar", mr_job_script="my_job.py", stdin=StringIO()
        )
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = "0.20.204"
        self.simple_patch(self.runner, "_new_upload_args", return_value=["new_upload_args"])
        self.simple_patch(self.runner, "_old_upload_args", return_value=["old_upload_args"])
        self.simple_patch(self.runner, "_hadoop_args_for_step", return_value=["hadoop_args_for_step"])
        self.simple_patch(self.runner, "_hdfs_step_input_files", return_value=["hdfs_step_input_files"])
        self.simple_patch(self.runner, "_hdfs_step_output_dir", return_value="hdfs_step_output_dir")
        self.runner._script_path = "my_job.py"

        self._new_basic_args = [
            "hadoop",
            "jar",
            "streaming.jar",
            "new_upload_args",
            "hadoop_args_for_step",
            "-input",
            "hdfs_step_input_files",
            "-output",
            "hdfs_step_output_dir",
        ]

        self._old_basic_args = [
            "hadoop",
            "jar",
            "streaming.jar",
            "hadoop_args_for_step",
            "-input",
            "hdfs_step_input_files",
            "-output",
            "hdfs_step_output_dir",
            "old_upload_args",
        ]

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj, attr, side_effect=side_effect, return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(self.runner._args_for_streaming_step(0), self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        self.runner._hadoop_version = "0.18"
        self.runner._steps = [step]
        self.assertEqual(self.runner._args_for_streaming_step(0), self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {"type": "streaming", "mapper": {"type": "script"}},
            ["-mapper", "python my_job.py --step-num=0 --mapper", "-jobconf", "mapred.reduce.tasks=0"],
        )

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {"type": "streaming", "reducer": {"type": "script"}},
            ["-mapper", "cat", "-reducer", "python my_job.py --step-num=0 --reducer"],
        )

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                "type": "streaming",
                "mapper": {"type": "script", "pre_filter": "grep anything"},
                "combiner": {"type": "script", "pre_filter": "grep nothing"},
                "reducer": {"type": "script", "pre_filter": "grep something"},
            },
            [
                "-mapper",
                "bash -c 'grep anything | python my_job.py --step-num=0" " --mapper'",
                "-combiner",
                "bash -c 'grep nothing | python my_job.py --step-num=0" " --combiner'",
                "-reducer",
                "bash -c 'grep something | python my_job.py --step-num=0" " --reducer'",
            ],
        )

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {"type": "streaming", "mapper": {"type": "command", "command": "cat"}, "combiner": {"type": "script"}},
            [
                "-mapper",
                "bash -c 'cat | sort | python my_job.py --step-num=0" " --combiner'",
                "-jobconf",
                "mapred.reduce.tasks=0",
            ],
        )

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                "type": "streaming",
                "mapper": {"type": "script", "pre_filter": "grep anything"},
                "combiner": {"type": "script", "pre_filter": "grep nothing"},
                "reducer": {"type": "script", "pre_filter": "grep something"},
            },
            [
                "-mapper",
                "bash -c 'grep anything | python my_job.py --step-num=0"
                " --mapper | sort | grep nothing | python my_job.py"
                " --step-num=0 --combiner'",
                "-reducer",
                "bash -c 'grep something | python my_job.py --step-num=0" " --reducer'",
            ],
        )

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {"type": "streaming", "mapper": {"type": "script", "pre_filter": bash_wrap("grep 'anything'")}},
            [
                "-mapper",
                "bash -c 'bash -c '\\''grep"
                " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
                " python my_job.py --step-num=0 --mapper'",
                "-jobconf",
                "mapred.reduce.tasks=0",
            ],
        )
Example No. 43
0
    def test_missing_hadoop_version(self):
        with patch.dict('os.environ', MOCK_HADOOP_VERSION=''):
            runner = HadoopJobRunner()
            self.assertRaises(Exception, runner.get_hadoop_version)
Example No. 44
0
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    BASIC_HADOOP_ARGS = [
        'hadoop',
        'jar', '<streaming jar>',
        '<upload args>',
        '<hadoop args for step>',
    ]

    BASIC_JOB_ARGS = [
        '-input', '<hdfs step input files>',
        '-output', '<hdfs step output dir>',
    ]


    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['<upload args>']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['<hadoop args for step>']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['<hdfs step input files>']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='<hdfs step output dir>'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='2.7.1'))
        self.runner._script_path = 'my_job.py'

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + args)

    def test_basic_mapper(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))

    def test_basic_mapper_pre_yarn(self):
        # use a different jobconf (-D) on pre-YARN
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.0.3'))

        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapred.reduce.tasks=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))

    def test_basic_reducer(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + [
                '-mapper',
                'cat',
                '-reducer',
                PYTHON_BIN + ' my_job.py --step-num=0 --reducer']))

    def test_pre_filters(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + [
             '-mapper',
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             '-combiner',
             "bash -c 'grep nothing | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | " + PYTHON_BIN +
             " my_job.py --step-num=0 --reducer'"]))

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 "bash -c 'bash -c '\\''grep"
                 " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
                 PYTHON_BIN +
                 " my_job.py --step-num=0 --mapper'"]))
Example No. 46
0
class HadoopLogDirsTestCase(SandboxedTestCase):

    def setUp(self):
        super(HadoopLogDirsTestCase, self).setUp()

        os.environ.clear()

        self.mock_hadoop_version = '2.7.0'
        # the result of _hadoop_dir(). This handles non-log-specific
        # environment variables, such as $HADOOP_PREFIX, and also guesses
        # based on the path of the Hadoop binary
        self.mock_hadoop_dirs = []

        def mock_get_hadoop_version():
            return self.mock_hadoop_version

        def mock_hadoop_dirs_method():
            return (d for d in self.mock_hadoop_dirs)

        self.start(patch('mrjob.hadoop.HadoopJobRunner.get_hadoop_version',
                         side_effect=mock_get_hadoop_version))
        self.start(patch('mrjob.hadoop.HadoopJobRunner._hadoop_dirs',
                         side_effect=mock_hadoop_dirs_method))

        self.runner = HadoopJobRunner()

    def test_empty(self):
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['hdfs:///tmp/hadoop-yarn/staging',
                          '/mnt/var/log/hadoop'])

    def test_precedence(self):
        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'
        os.environ['YARN_LOG_DIR'] = '/path/to/yarn-log-dir'
        self.mock_hadoop_dirs = ['/path/to/hadoop-prefix',
                                 '/path/to/hadoop-home']

        self.assertEqual(
            list(self.runner._hadoop_log_dirs(output_dir='hdfs:///output/')),
            ['/path/to/hadoop-log-dir',
             '/path/to/yarn-log-dir',
             'hdfs:///tmp/hadoop-yarn/staging',
             'hdfs:///output/_logs',
             '/path/to/hadoop-prefix/logs',
             '/path/to/hadoop-home/logs',
             '/mnt/var/log/hadoop'])

    def test_hadoop_log_dirs_opt(self):
        self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

        # setting hadoop_log_dirs short-circuits automatic discovery of logs
        self.assertEqual(
            list(self.runner._hadoop_log_dirs()),
            ['/logs1', '/logs2'])


    def test_need_yarn_for_yarn_log_dir_and_hdfs_log_dir(self):
        os.environ['YARN_LOG_DIR'] = '/path/to/yarn-log-dir'

        self.mock_hadoop_version = '2.0.0'
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['/path/to/yarn-log-dir',
                          'hdfs:///tmp/hadoop-yarn/staging',
                          '/mnt/var/log/hadoop'])

        self.mock_hadoop_version = '1.0.3'
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['/mnt/var/log/hadoop'])
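The ordering pinned down by test_precedence, and the YARN gating in test_need_yarn_for_yarn_log_dir_and_hdfs_log_dir, can be summarized in one generator. This is a sketch only: the argument names are invented, and the YARN check is simplified to "2.x or later", whereas the real _hadoop_log_dirs() lives on the runner and may decide differently:

import os
import posixpath

def hadoop_log_dirs(hadoop_version, hadoop_dirs, output_dir=None,
                    hadoop_log_dirs_opt=None):
    # an explicit hadoop_log_dirs option short-circuits all discovery
    if hadoop_log_dirs_opt:
        for path in hadoop_log_dirs_opt:
            yield path
        return

    uses_yarn = int(hadoop_version.split('.')[0]) >= 2  # simplified check

    if 'HADOOP_LOG_DIR' in os.environ:
        yield os.environ['HADOOP_LOG_DIR']
    if uses_yarn and 'YARN_LOG_DIR' in os.environ:
        yield os.environ['YARN_LOG_DIR']
    if uses_yarn:
        yield 'hdfs:///tmp/hadoop-yarn/staging'
    if output_dir:
        yield posixpath.join(output_dir, '_logs')
    for hadoop_dir in hadoop_dirs:  # e.g. $HADOOP_PREFIX, $HADOOP_HOME
        yield os.path.join(hadoop_dir, 'logs')
    yield '/mnt/var/log/hadoop'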
Example No. 47
0
class HadoopStreamingJarTestCase(SandboxedTestCase):

    def setUp(self):
        super(HadoopStreamingJarTestCase, self).setUp()

        self.mock_paths = []

        def mock_ls(path):  # don't bother to support globs
            return (p for p in sorted(self.mock_paths) if p.startswith(path))

        self.start(patch('mrjob.fs.local.LocalFilesystem.ls',
                         side_effect=mock_ls))

        os.environ.clear()

        self.runner = HadoopJobRunner()

    def test_empty_fs(self):
        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_deprecated_hadoop_home_option(self):
        self.runner = HadoopJobRunner(hadoop_home='/ha/do/op/home-option')

        self.mock_paths.append('/ha/do/op/home-option/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home-option/hadoop-streaming.jar')

    def test_deprecated_hadoop_home_option_beats_hadoop_prefix(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/prefix'
        self.mock_paths.append('/ha/do/op/prefix/hadoop-streaming.jar')

        self.test_deprecated_hadoop_home_option()

    # tests of well-known environment variables

    def test_hadoop_prefix(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/prefix'
        self.mock_paths.append('/ha/do/op/prefix/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/prefix/hadoop-streaming.jar')

    def test_hadoop_prefix_beats_hadoop_home(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/home'
        self.mock_paths.append('/ha/do/op/home/hadoop-streaming.jar')

        self.test_hadoop_prefix()

    def test_hadoop_home(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/home'
        self.mock_paths.append('/ha/do/op/home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home/hadoop-streaming.jar')

    def test_hadoop_home_beats_hadoop_install(self):
        os.environ['HADOOP_INSTALL'] = '/ha/do/op/install'
        self.mock_paths.append('/ha/do/op/install/hadoop-streaming.jar')

        self.test_hadoop_home()

    def test_hadoop_install(self):
        os.environ['HADOOP_INSTALL'] = '/ha/do/op/install'
        self.mock_paths.append('/ha/do/op/install/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/install/hadoop-streaming.jar')

    def test_hadoop_install_beats_hadoop_mapred_home(self):
        os.environ['HADOOP_MAPRED_HOME'] = '/ha/do/op/mapred-home'
        self.mock_paths.append('/ha/do/op/mapred-home/hadoop-streaming.jar')

        self.test_hadoop_install()

    def test_hadoop_mapred_home(self):
        os.environ['HADOOP_MAPRED_HOME'] = '/ha/do/op/mapred-home'
        self.mock_paths.append('/ha/do/op/mapred-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/mapred-home/hadoop-streaming.jar')

    def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

        self.test_hadoop_mapred_home()

    # infer from hadoop_bin

    def test_infer_from_hadoop_bin_parent_dir(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/bin-parent/hadoop-streaming.jar')

    def test_hadoop_bin_beats_hadoop_anything_home(self):
        os.environ['HADOOP_ANYTHING_HOME'] = '/ha/do/op/anything-home'
        self.mock_paths.append('/ha/do/op/anything-home/hadoop-streaming.jar')

        self.test_infer_from_hadoop_bin_parent_dir()

    def test_dont_infer_from_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/bin/hadoop'])
        self.mock_paths.append('/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_dont_infer_from_usr_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
        self.mock_paths.append('/usr/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_dont_infer_from_usr_local_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/local/bin/hadoop'])
        self.mock_paths.append('/usr/local/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_infer_from_hadoop_bin_realpath(self):
        with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
            self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
            self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

            self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                             '/ha/do/op/hadoop-streaming.jar')

    # tests of fallback environment variables ($HADOOP_*_HOME)

    def test_hadoop_anything_home(self):
        os.environ['HADOOP_WHATEVER_HOME'] = '/ha/do/op/whatever-home'
        self.mock_paths.append('/ha/do/op/whatever-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/whatever-home/hadoop-streaming.jar')

        # $HADOOP_ANYTHING_HOME comes before $HADOOP_WHATEVER_HOME
        os.environ['HADOOP_ANYTHING_HOME'] = '/ha/do/op/anything-home'
        self.mock_paths.append('/ha/do/op/anything-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/anything-home/hadoop-streaming.jar')

    def test_hadoop_anything_home_beats_hard_coded_paths(self):
        self.mock_paths.append('/home/hadoop/contrib/hadoop-streaming.jar')
        self.mock_paths.append(
            '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')

        self.test_hadoop_anything_home()

    # hard-coded paths (for Hadoop inside EMR)

    def test_hard_coded_emr_paths(self):
        self.mock_paths.append(
            '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')

        # /home/hadoop/contrib takes precedence
        self.mock_paths.append('/home/hadoop/contrib/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/home/hadoop/contrib/hadoop-streaming.jar')

    # invalid environment variables

    def test_other_environment_variable(self):
        os.environ['HADOOP_YARN_MRJOB_DIR'] = '/ha/do/op/yarn-mrjob-dir'
        self.mock_paths.append(
            '/ha/do/op/yarn-mrjob-dir/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    # alternate jar names and paths

    def test_subdirs(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.mock_paths.append('/ha/do/op/contrib/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/contrib/hadoop-streaming.jar')

    def test_hadoop_streaming_jar_name_with_version(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-2.6.0-amzn-0.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-2.6.0-amzn-0.jar')

    def test_skip_hadoop_streaming_source_jar(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        # Googled it; it really is named *-sources.jar, not *-source.jar
        self.mock_paths.append(
            '/ha/do/op/hadoop-streaming-2.0.0-mr1-cdh4.3.1-sources.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    # multiple matching jars in same directory

    def test_pick_shortest_name(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-1.0.3.jar')
        self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

        # hadoop-streaming-1.0.3.jar comes first in alphabetical order
        self.assertEqual(sorted(self.mock_paths), self.mock_paths)

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming.jar')

    def test_pick_shallowest_subpath(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-1.0.3.jar')
        self.mock_paths.append('/ha/do/op/old/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-1.0.3.jar')

    def test_fall_back_to_alphabetical_order(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.mock_paths.append('/ha/do/op/hadoop-streaming-a.jar')
        self.mock_paths.append('/ha/do/op/hadoop-streaming-b.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-a.jar')

    # sanity-check that directory order overrides path sort order

    def test_directory_order_overrides_path_sort_order(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/a'
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/b'

        self.mock_paths.append('/ha/do/op/a/hadoop-streaming-a.jar')
        self.mock_paths.append('/ha/do/op/b/hadoop-streaming-b.jar')

        # $HADOOP_PREFIX takes precendence over $HADOOP_HOME, so sort
        # order doesn't matter
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/b/hadoop-streaming-b.jar')

        # now search in parent dir (/ha/do/op) to invoke sort order
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/a/hadoop-streaming-a.jar')
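The last few tests fix a tie-breaking order for jars found under a single search directory: shallower paths win, then shorter file names, then alphabetical order, and *-sources.jar is skipped; which directory gets searched first is settled separately by the environment-variable precedence tested earlier. A sketch of that ranking (the function name is invented and this is not mrjob's actual code):

import posixpath

def pick_streaming_jar(paths):
    candidates = [
        p for p in paths
        if posixpath.basename(p).startswith('hadoop-streaming') and
        p.endswith('.jar') and not p.endswith('-sources.jar')
    ]
    if not candidates:
        return None
    return min(candidates, key=lambda p: (
        p.count('/'),                # shallowest subpath first
        len(posixpath.basename(p)),  # then shortest jar name
        p,                           # then alphabetical order
    ))

assert pick_streaming_jar(['/ha/do/op/hadoop-streaming-1.0.3.jar',
                           '/ha/do/op/old/hadoop-streaming.jar']) == \
    '/ha/do/op/hadoop-streaming-1.0.3.jar'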
Example No. 49
0
    def setUp(self):
        super(PickErrorTestCase, self).setUp()

        os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'

        self.runner = HadoopJobRunner()
Example No. 50
0
    def test_get_hadoop_version(self):
        runner = HadoopJobRunner()
        self.assertEqual(runner.get_hadoop_version(), '1.2.0')
Example No. 51
0
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=StringIO())
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_args_for_step',
                          return_value=['hadoop_args_for_step'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj, attr, side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        self.runner._hadoop_version = '0.18'
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper', 'python my_job.py --step-num=0 --mapper',
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper', 'cat',
             '-reducer', 'python my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ["-mapper",
             "bash -c 'grep anything | python my_job.py --step-num=0"
                 " --mapper'",
             "-combiner",
             "bash -c 'grep nothing | python my_job.py --step-num=0"
                 " --combiner'",
             "-reducer",
             "bash -c 'grep something | python my_job.py --step-num=0"
                 " --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ["-mapper",
             "bash -c 'cat | sort | python my_job.py --step-num=0"
                " --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | python my_job.py --step-num=0"
                " --mapper | sort | grep nothing | python my_job.py"
                " --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | python my_job.py --step-num=0"
                " --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
                 " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
                 " python my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])