Example #1
0
    def test_defaults(self):
        # Build a SparkStep from nothing but the spark callable and check
        # what it reports when no spark_args are supplied.
        # NOTE(review): other tests here expect description() dicts
        # without a 'jobconf' key -- confirm which SparkStep version
        # this expectation targets.
        expected_description = {
            'type': 'spark',
            'jobconf': {},
            'spark_args': [],
        }

        step = SparkStep(spark=spark_func)

        self.assertEqual(step.spark, spark_func)
        self.assertEqual(step.spark_args, [])
        self.assertEqual(step.description(0), expected_description)
Example #2
0
    def test_defaults(self):
        # Only the spark callable is given; spark_args should come back
        # as an empty list.
        # NOTE(review): other tests here expect description() dicts
        # without a 'jobconf' key -- confirm which SparkStep version
        # this expectation targets.
        step = SparkStep(spark=spark_func)
        expected_description = dict(type='spark', jobconf={})
        expected_description['spark_args'] = []

        self.assertEqual(step.spark, spark_func)
        self.assertEqual(step.spark_args, [])
        self.assertEqual(step.description(0), expected_description)
Example #3
0
    def test_all_args(self):
        # Supply both spark and spark_args and check that each one is
        # stored on the step and echoed back by description().
        expected_args = ['argh', 'argh']

        # hand the step its own copy so the expectation stays independent
        step = SparkStep(spark=spark_func, spark_args=list(expected_args))

        self.assertEqual(step.spark, spark_func)
        self.assertEqual(step.spark_args, expected_args)
        self.assertEqual(
            step.description(0),
            {'type': 'spark', 'spark_args': expected_args},
        )
Example #4
0
    def test_all_args(self):
        # A step built with explicit spark_args must expose them both as
        # an attribute and in its description.
        step = SparkStep(spark=spark_func, spark_args=['argh', 'argh'])

        expected_args = ['argh', 'argh']
        expected_description = {'type': 'spark', 'spark_args': expected_args}

        self.assertEqual(step.spark, spark_func)
        self.assertEqual(step.spark_args, expected_args)
        self.assertEqual(step.description(0), expected_description)
Example #5
0
    def test_spark_method(self):
        # Replacing spark() on a job should make steps() return a single
        # SparkStep wrapping it, described with empty spark_args.
        job = MRJob(['--no-conf'])
        job.spark = MagicMock()

        actual_steps = job.steps()
        self.assertEqual(actual_steps, [SparkStep(job.spark)])

        actual_desc = job._steps_desc()
        self.assertEqual(actual_desc, [{'type': 'spark', 'spark_args': []}])
Example #6
0
    def test_spark_and_spark_args_methods(self):
        # Whatever spark_args() returns should end up on the SparkStep
        # that steps() builds, and in that step's description.
        job = MRJob(['--no-conf'])
        job.spark = MagicMock()
        job.spark_args = MagicMock(return_value=['argh', 'ARRRRGH!'])

        actual_steps = job.steps()
        expected_step = SparkStep(job.spark, spark_args=['argh', 'ARRRRGH!'])
        self.assertEqual(actual_steps, [expected_step])

        actual_desc = job._steps_desc()
        expected_desc = {'type': 'spark', 'spark_args': ['argh', 'ARRRRGH!']}
        self.assertEqual(actual_desc, [expected_desc])
Example #7
0
    def test_spark_with_step_num(self):
        # With --step-num=1 and --spark, execute() should call only the
        # spark function of the second step, passing it the two paths.
        argv = ['--step-num=1', '--spark', 'input_dir', 'output_dir']
        job = MRJob(argv)

        mapper = MagicMock()
        spark = MagicMock()

        # stub out steps() so the job has a mapper step, then a spark step
        two_steps = [MRStep(mapper=mapper), SparkStep(spark)]
        job.steps = Mock(return_value=two_steps)

        job.execute()

        spark.assert_called_once_with('input_dir', 'output_dir')
        self.assertFalse(mapper.called)
Example #8
0
    def steps(self):
        """Re-define this to make a multi-step job.

        When this method is left alone, a one-step job is assembled
        automatically from whichever of :py:meth:`mapper`,
        :py:meth:`mapper_init`, :py:meth:`mapper_final`,
        :py:meth:`reducer_init`, :py:meth:`reducer_final`, and
        :py:meth:`reducer` you have re-defined. To define the steps
        yourself, return them as a list, for example::

            def steps(self):
                return [MRStep(mapper=self.transform_input,
                               reducer=self.consolidate_1),
                        MRStep(reducer_init=self.log_mapper_init,
                               reducer=self.consolidate_2)]

        If :py:meth:`spark` is the only step function re-defined, the
        result is a single :py:class:`~mrjob.step.SparkStep` that also
        receives the return value of :py:meth:`spark_args`. If nothing
        has been re-defined, the result is an empty list.

        :return: a list of steps constructed with
                 :py:class:`~mrjob.step.MRStep` or other classes in
                 :py:mod:`mrjob.step`.
        :raises ValueError: if :py:meth:`spark` is re-defined together
                            with any other step function.
        """
        # gather just the step functions that differ from MRJob's own
        overridden = {}
        for name in _JOB_STEP_FUNC_PARAMS + ('spark',):
            method = getattr(self, name)
            if _im_func(method) is not _im_func(getattr(MRJob, name)):
                overridden[name] = method

        # special case for spark()
        # TODO: support jobconf as well
        if 'spark' in overridden:
            # spark() has to be the only thing that was redefined
            if len(overridden) > 1:
                raise ValueError(
                    "Can't mix spark() and streaming functions")

            spark_step = SparkStep(
                spark=overridden['spark'],
                spark_args=self.spark_args())
            return [spark_step]

        if not overridden:
            return []

        # MRStep wants command strings, but the class defines *_cmd and
        # *_pre_filter as functions returning strings, so call those now.
        step_kwargs = dict(overridden)
        for name, func in overridden.items():
            if name.endswith(('_cmd', '_pre_filter')):
                step_kwargs[name] = func()

        return [MRStep(**step_kwargs)]
Example #9
0
    def test_positional_spark_arg(self):
        # Passing spark positionally must be equivalent to passing it by
        # keyword, both for equality and for the step description.
        positional = SparkStep(spark_func)
        by_keyword = SparkStep(spark=spark_func)

        self.assertEqual(positional, by_keyword)

        positional_desc = positional.description(0)
        self.assertEqual(positional_desc, by_keyword.description(0))
Example #10
0
 def steps(self):
     # Two steps, in order: an MRStep wrapping self.mapper, then a
     # SparkStep wrapping self.spark.
     mapper_step = MRStep(mapper=self.mapper)
     spark_step = SparkStep(self.spark)
     return [mapper_step, spark_step]
Example #11
0
    def test_positional_spark_arg(self):
        # The spark callable may be given as the first positional argument
        # or as spark=...; both steps should compare equal and describe
        # themselves identically.
        step_from_positional = SparkStep(spark_func)
        step_from_keyword = SparkStep(spark=spark_func)

        self.assertEqual(step_from_positional, step_from_keyword)
        self.assertEqual(
            step_from_positional.description(0),
            step_from_keyword.description(0),
        )