def _harness_job(self, job_class, input_bytes=b'', input_paths=(),
                     runner_alias='inline', compression_codec=None,
                     job_args=None, spark_conf=None, first_step_num=None,
                     last_step_num=None, counter_output_dir=None,
                     num_reducers=None):
        job_class_path = '%s.%s' % (job_class.__module__, job_class.__name__)

        harness_job_args = ['-r', runner_alias, '--job-class', job_class_path]
        if spark_conf:
            for key, value in spark_conf.items():
                harness_job_args.append('--jobconf')
                harness_job_args.append('%s=%s' % (key, value))
        if compression_codec:
            harness_job_args.append('--compression-codec')
            harness_job_args.append(compression_codec)
        if job_args:
            harness_job_args.extend(['--job-args', cmd_line(job_args)])
        if first_step_num is not None:
            harness_job_args.extend(['--first-step-num', str(first_step_num)])
        if last_step_num is not None:
            harness_job_args.extend(['--last-step-num', str(last_step_num)])
        if counter_output_dir is not None:
            harness_job_args.extend(
                ['--counter-output-dir', counter_output_dir])
        if num_reducers is not None:
            harness_job_args.extend(
                ['--num-reducers', str(num_reducers)])

        harness_job_args.extend(input_paths)

        harness_job = MRSparkHarness(harness_job_args)
        harness_job.sandbox(stdin=BytesIO(input_bytes))

        return harness_job
# Example #2: second variant of _harness_job, extended with extra options
    def _harness_job(self,
                     job_class,
                     input_bytes=b'',
                     input_paths=(),
                     runner_alias='inline',
                     compression_codec=None,
                     job_args=None,
                     spark_conf=None,
                     first_step_num=None,
                     last_step_num=None,
                     counter_output_dir=None,
                     num_reducers=None,
                     max_output_files=None,
                     emulate_map_input_file=False,
                     skip_internal_protocol=False):
        """Build a sandboxed :class:`MRSparkHarness` job wrapping *job_class*.

        Keyword arguments are translated into harness command-line switches;
        *input_paths* are appended as positional args, and the job is
        sandboxed with *input_bytes* as stdin. Returns the sandboxed job.
        """
        # Imported here (not at module level) so importing this test module
        # doesn't require the harness module.
        from tests.mr_spark_harness import MRSparkHarness

        # Fully-qualified dotted path the harness uses to import the job.
        class_path = '%s.%s' % (job_class.__module__, job_class.__name__)

        args = ['-r', runner_alias, '--job-class', class_path]

        # Each Spark conf entry becomes its own --jobconf KEY=VALUE pair.
        for key, value in (spark_conf or {}).items():
            args += ['--jobconf', '%s=%s' % (key, value)]

        if compression_codec:
            args += ['--compression-codec', compression_codec]

        if job_args:
            # cmd_line() shell-quotes the job's own args into one token.
            args += ['--job-args', cmd_line(job_args)]

        if first_step_num is not None:
            args += ['--first-step-num', str(first_step_num)]
        if last_step_num is not None:
            args += ['--last-step-num', str(last_step_num)]
        if counter_output_dir is not None:
            args += ['--counter-output-dir', counter_output_dir]
        if num_reducers is not None:
            args += ['--num-reducers', str(num_reducers)]
        if max_output_files is not None:
            args += ['--max-output-files', str(max_output_files)]

        # Boolean flags carry no value; present means enabled.
        if emulate_map_input_file:
            args.append('--emulate-map-input-file')
        if skip_internal_protocol:
            args.append('--skip-internal-protocol')

        # Positional input paths go last, after all switches.
        args += list(input_paths)

        job = MRSparkHarness(args)
        job.sandbox(stdin=BytesIO(input_bytes))

        return job