Example #1
    def test_spark_with_step_num(self):
        job = MRJob(["--step-num=1", "--spark", "input_dir", "output_dir"])

        mapper = MagicMock()
        spark = MagicMock()

        job.steps = Mock(return_value=[MRStep(mapper=mapper), SparkStep(spark)])

        job.execute()

        spark.assert_called_once_with("input_dir", "output_dir")
        self.assertFalse(mapper.called)
Example #2
        def mock_Popen(*args, **kwargs):
            mock_proc = MagicMock()

            mock_proc.stdout = MagicMock()
            mock_proc.stdout.__iter__.return_value = [
                b'line1\n', b'line2\n']

            mock_proc.stderr = MagicMock()
            mock_proc.stderr.__iter__.return_value = [
                b'Emergency, everybody to get from street\n']

            mock_proc.wait.return_value = 0

            return mock_proc
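
The factory above only matters once it stands in for the real Popen. A minimal sketch of wiring it in, assuming mock_Popen is in scope and that the code under test looks Popen up via subprocess (in practice, patch whichever module actually imports it):

from unittest.mock import patch
import subprocess

with patch('subprocess.Popen', side_effect=mock_Popen):
    proc = subprocess.Popen(['cat', 'some_file'],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # the canned lines come back in place of real process output
    assert list(proc.stdout) == [b'line1\n', b'line2\n']
    assert proc.wait() == 0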
Example #3
    def test_spark(self):
        job = MRJob(['--spark', 'input_dir', 'output_dir'])
        job.spark = MagicMock()

        job.execute()

        job.spark.assert_called_once_with('input_dir', 'output_dir')
Example #4
    def test_wrong_type_of_step(self):
        mr_job = MRJob()
        mr_job.spark = MagicMock()

        self.assertRaises(TypeError, mr_job.run_mapper)
        self.assertRaises(TypeError, mr_job.run_combiner)
        self.assertRaises(TypeError, mr_job.run_reducer)
Example #5
    def test_spark_method(self):
        j = MRJob(['--no-conf'])
        j.spark = MagicMock()

        self.assertEqual(j.steps(), [SparkStep(j.spark)])

        self.assertEqual(j._steps_desc(), [dict(type='spark', spark_args=[])])
Example #6
    def prepare_runner_for_ssh(self, runner, num_workers=0):
        # TODO: Refactor this abomination of a test harness

        # Set up environment variables
        os.environ['MOCK_SSH_VERIFY_KEY_FILE'] = 'true'

        # Create temporary directories and add them to MOCK_SSH_ROOTS
        master_ssh_root = tempfile.mkdtemp(prefix='master_ssh_root.')
        os.environ['MOCK_SSH_ROOTS'] = 'testmaster=%s' % master_ssh_root
        mock_ssh_dir('testmaster', _EMR_LOG_DIR + '/hadoop/history')

        if not hasattr(self, 'worker_ssh_roots'):
            self.worker_ssh_roots = []

        self.addCleanup(self.teardown_ssh, master_ssh_root)

        # Make the fake binary
        os.mkdir(os.path.join(master_ssh_root, 'bin'))
        self.ssh_bin = os.path.join(master_ssh_root, 'bin', 'ssh')
        create_mock_ssh_script(self.ssh_bin)
        self.ssh_add_bin = os.path.join(master_ssh_root, 'bin', 'ssh-add')
        create_mock_ssh_script(self.ssh_add_bin)

        # Make a fake keyfile so that the 'file exists' requirements are
        # satisfied
        self.keyfile_path = os.path.join(master_ssh_root, 'key.pem')
        with open(self.keyfile_path, 'w') as f:
            f.write('I AM DEFINITELY AN SSH KEY FILE')

        # Tell the runner to use the fake binary
        runner._opts['ssh_bin'] = [self.ssh_bin]
        runner._opts['ssh_add_bin'] = [self.ssh_add_bin]
        # Also pretend to have an SSH key pair file
        runner._opts['ec2_key_pair_file'] = self.keyfile_path

        # use fake hostname
        runner._address_of_master = MagicMock(return_value='testmaster')
        runner._master_private_ip = MagicMock(return_value='172.172.172.172')

        # re-initialize fs
        runner._fs = None
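
teardown_ssh() is registered with addCleanup() above but not shown. A minimal sketch of such a cleanup, assuming all it has to undo is the temporary directories (hypothetical; shutil imported at module level):

    def teardown_ssh(self, master_ssh_root):
        # hypothetical cleanup: remove the temporary SSH roots created in
        # prepare_runner_for_ssh()
        shutil.rmtree(master_ssh_root, ignore_errors=True)
        for worker_root in self.worker_ssh_roots:
            shutil.rmtree(worker_root, ignore_errors=True)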
Example #7
    def _make_launcher(self, *args):
        """Make a launcher, add a mock runner (``launcher.mock_runner``), and
        set it up so that ``launcher.make_runner().__enter__()`` returns
        ``launcher.mock_runner``.
        """
        launcher = MRJobLauncher(args=['--no-conf', ''] + list(args))
        launcher.sandbox()

        launcher.mock_runner = Mock()
        launcher.mock_runner.stream_output.return_value = [b'a line\n']

        launcher.make_runner = MagicMock()  # include __enter__
        launcher.make_runner.return_value.__enter__.return_value = (
            launcher.mock_runner)

        return launcher
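
A test built on this helper can then drive the launcher end to end. A sketch, assuming run_job() is the entry point (as on mrjob's MRJobLauncher) and that it enters make_runner() as a context manager:

    def test_runs_job(self):
        launcher = self._make_launcher()

        launcher.run_job()

        # the mocked context manager was entered, and the mock runner ran
        launcher.make_runner.return_value.__enter__.assert_called_once_with()
        launcher.mock_runner.run.assert_called_once_with()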
Example #8
    def setUp(self):
        super(SparkSubmitToolTestCase, self).setUp()

        self.runner_class = None

        self.runner = MagicMock()

        def _mock_runner_class(runner_alias):
            rc = _runner_class(runner_alias)

            self.runner_class = Mock()
            self.runner_class.return_value = self.runner
            self.runner_class.alias = rc.alias
            self.runner_class.OPT_NAMES = rc.OPT_NAMES

            return self.runner_class

        self.runner_class = self.start(
            patch('mrjob.tools.spark_submit._runner_class',
                  side_effect=_mock_runner_class))

        self.runner_log = self.start(patch('mrjob.runner.log'))

        # don't actually want to exit after printing help
        self.exit = self.start(patch('sys.exit', side_effect=MockSystemExit))

        # don't set up logging
        self.set_up_logging = self.start(
            patch('mrjob.job.MRJob.set_up_logging'))

        # save printout, rather than actually printing
        self.printout = ''

        def _mock_print_message(self_, message, file=None):
            self.printout += message

        self.start(
            patch('argparse.ArgumentParser._print_message',
                  _mock_print_message))

        def _mock_print(s=''):
            self.printout += s + '\n'

        # print() isn't considered part of the module in Python 3.4
        self.start(
            patch('mrjob.tools.spark_submit.print', _mock_print, create=True))
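
With this harness in place, a test can invoke the tool and make assertions against the captured mocks. A sketch, assuming main() in mrjob.tools.spark_submit accepts an argument list:

    def test_runs_script(self):
        from mrjob.tools.spark_submit import main

        main(['foo.py', 'arg1'])

        # the patched _runner_class handed back our mock runner class,
        # which the tool instantiated and ran
        self.assertTrue(self.runner_class.called)
        self.runner.run.assert_called_once_with()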
Example #9
        def mock_Popen(*args, **kwargs):
            mock_proc = MagicMock()

            mock_proc.stdout = MagicMock()
            mock_proc.stdout.__iter__.return_value = [b'line1\n', b'line2\n']

            mock_proc.stderr = MagicMock()
            mock_proc.stderr.__iter__.return_value = [
                b'Emergency, everybody to get from street\n'
            ]

            mock_proc.wait.return_value = 0

            return mock_proc
Example #10
    def setUpClass(cls):
        super(SingleSparkContextTestCase, cls).setUpClass()

        if not PY2:
            # ignore Python 3 warnings about unclosed filehandles
            filterwarnings('ignore', category=ResourceWarning)

        from pyspark import SparkContext
        cls.spark_context = SparkContext()

        # move stop() so that scripts can't call it
        cls.spark_context.really_stop = cls.spark_context.stop
        cls.spark_context.stop = MagicMock()

        try:
            cls.spark_context.setLogLevel('FATAL')
        except:
            # tearDownClass() won't be called if there's an exception
            cls.spark_context.really_stop()
            raise
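
Because stop() is mocked out here, the matching teardown has to shut Spark down through the saved alias. A sketch of the counterpart (an assumption about the rest of the class):

    @classmethod
    def tearDownClass(cls):
        # stop() is a MagicMock, so use the alias saved in setUpClass()
        cls.spark_context.really_stop()
        super(SingleSparkContextTestCase, cls).tearDownClass()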
Example #11
    def test_too_few_args(self):
        job = MRJob(['--spark'])
        job.spark = MagicMock()

        self.assertRaises(ValueError, job.execute)
Example #12
    def test_wrong_step_num(self):
        job = MRJob(['--step-num=1', '--spark', 'input_dir', 'output_dir'])
        job.spark = MagicMock()

        self.assertRaises(ValueError, job.execute)
Example #13
    def test_wrong_step_type(self):
        job = MRJob(['--spark', 'input_dir', 'output_dir'])
        job.mapper = MagicMock()

        self.assertRaises(TypeError, job.execute)
Example #14
    def test_spark_args_ignored_without_spark(self):
        j = MRJob(['--no-conf'])
        j.reducer = MagicMock()
        j.spark_args = MagicMock(return_value=['argh', 'ARRRRGH!'])

        self.assertEqual(j.steps(), [MRStep(reducer=j.reducer)])
Example #15
    def test_spark_and_streaming_dont_mix(self):
        j = MRJob(['--no-conf'])
        j.mapper = MagicMock()
        j.spark = MagicMock()

        self.assertRaises(ValueError, j.steps)
Example #16
    def test_too_many_args(self):
        job = MRJob(['--spark', 'input_dir', 'output_dir', 'error_dir'])
        job.spark = MagicMock()

        self.assertRaises(ValueError, job.execute)
Example #17
def patch_fs_s3():
    m_boto = MagicMock()
    m_s3 = m_boto.connect_s3()
    m_s3.get_all_buckets.__name__ = 'get_all_buckets'
    return patch('mrjob.fs.s3.boto', m_boto)
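
The returned patcher works like any other mock.patch object; because new is supplied, entering the context yields the mock boto module itself. For example:

with patch_fs_s3() as m_boto:
    # code that imports boto via mrjob.fs.s3 now sees the mock;
    # connect_s3() returns the same child mock on every call, so
    # canned return values stick
    m_boto.connect_s3().get_all_buckets.return_value = []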