Example #1
0
File: hadoop.py Project: Yelp/mrjob
 def _default_opts(self):
     """Return the parent runner's default opts combined with
     ``hadoop_tmp_dir='tmp/mrjob'``."""
     inherited = super(HadoopJobRunner, self)._default_opts()
     hadoop_defaults = {'hadoop_tmp_dir': 'tmp/mrjob'}
     return combine_dicts(inherited, hadoop_defaults)
Example #2
0
File: bin.py Project: Affirm/mrjob
 def _default_opts(self):
     """Return the parent runner's default opts combined with
     ``read_logs=True``."""
     inherited = super(MRJobBinRunner, self)._default_opts()
     bin_defaults = {'read_logs': True}
     return combine_dicts(inherited, bin_defaults)
Example #3
0
def _add_runner_args_for_opt(parser, opt_name, include_deprecated=True):
    """Register every switch configured for the runner option *opt_name*
    on *parser*.

    :param parser: object whose ``add_argument()`` method receives the
                   switches
    :param opt_name: key into ``_RUNNER_OPTS``; also used as the ``dest``
                     of every switch added
    :param include_deprecated: if false, skip options marked deprecated
                               and don't add deprecated aliases
    """
    opt_conf = _RUNNER_OPTS[opt_name]

    if opt_conf.get('deprecated') and not include_deprecated:
        return

    for switch_names, switch_kwargs in (opt_conf.get('switches') or []):
        # work on a copy so the shared config isn't modified
        add_kwargs = dict(switch_kwargs)
        aliases = add_kwargs.pop('deprecated_aliases', None)

        add_kwargs['dest'] = opt_name

        # "append" switches start from an empty list, everything else
        # from None
        is_append = add_kwargs.get('action') == 'append'
        add_kwargs['default'] = [] if is_append else None

        parser.add_argument(*switch_names, **add_kwargs)

        if not (aliases and include_deprecated):
            continue

        # add a switch for deprecated aliases
        plural = 'es' if len(aliases) > 1 else ''
        alias_help = 'Deprecated alias%s for %s' % (plural, switch_names[-1])
        parser.add_argument(
            *aliases,
            **combine_dicts(add_kwargs, dict(help=alias_help)))
Example #4
0
 def _default_opts(self):
     """Return the parent runner's default opts combined with
     ``sh_bin=['sh', '-ex']``."""
     inherited = super(MRJobBinRunner, self)._default_opts()
     bin_defaults = {'sh_bin': ['sh', '-ex']}
     return combine_dicts(inherited, bin_defaults)
Example #5
0
 def default_options(self):
     """Combine the parent store's default options with the Hadoop
     defaults; ``hadoop_home`` is read from ``$HADOOP_HOME``."""
     inherited = super(HadoopRunnerOptionStore, self).default_options()
     hadoop_defaults = dict(
         hadoop_home=os.environ.get('HADOOP_HOME'),
         hdfs_scratch_dir='tmp/mrjob',
         check_input_paths=True,
     )
     return combine_dicts(inherited, hadoop_defaults)
    def jobconf(self):
        """Return the inherited jobconf combined with the reducer count
        (``self.options.shards``) and the total-order-partitioner path
        (``self.options.splitfile``)."""
        inherited = super(ZipNumClusterJob, self).jobconf()
        overrides = {
            'mapreduce.job.reduces': self.options.shards,
            'mapreduce.totalorderpartitioner.path': self.options.splitfile,
        }
        return combine_dicts(inherited, overrides)
Example #7
0
File: runner.py Project: Yelp/mrjob
 def _default_opts(self):
     """Return the parent runner's default opts combined with the
     default ``cloud_part_size_mb``."""
     inherited = super(SparkMRJobRunner, self)._default_opts()
     spark_defaults = {'cloud_part_size_mb': _DEFAULT_CLOUD_PART_SIZE_MB}
     return combine_dicts(inherited, spark_defaults)
Example #8
0
 def _opts_combiners(cls):
     """Map from option name to the combine_*() function used to merge
     values for that option.

     Extends the parent's map so that the Hadoop path options are
     combined with ``combine_paths``."""
     inherited = super(HadoopJobRunner, cls)._opts_combiners()
     path_opts = ("hadoop_bin", "hadoop_home", "hdfs_scratch_dir")
     return combine_dicts(inherited, dict.fromkeys(path_opts, combine_paths))
Example #9
0
    def test_hadoop_1(self):
        """For version '1.0', the updated jobconf gains ``user.name`` and
        the warnings mention the mismatch."""
        updated, warnings = self.updated_and_warnings(self.JOBCONF, '1.0')

        expected = combine_dicts(self.JOBCONF, {'user.name': 'dave'})
        self.assertEqual(updated, expected)

        for fragment in ('do not match hadoop version',
                         'mapreduce.job.user.name: user.name'):
            self.assertIn(fragment, warnings)
Example #10
0
File: util.py Project: Affirm/mrjob
    def paginate(self, **kwargs):
        """Call ``self.method(**kwargs)`` once, then yield the result
        combined with successive slices (of at most ``self.page_size``
        items) of the values stored under ``self.result_key``."""
        response = self.method(**kwargs)
        all_values = response[self.result_key]

        for start in range(0, len(all_values), self.page_size):
            stop = start + self.page_size
            yield combine_dicts(
                response, {self.result_key: all_values[start:stop]})
Example #11
0
 def _job_runner_kwargs_for_runner(self, runner_alias):
     """Shared implementation behind the *_job_runner_kwargs() methods.

     Combines the kwargs derived from switches allowed for
     *runner_alias* with :py:meth:`job_runner_kwargs`, which is applied
     last."""
     # user can no longer silently ignore switches by overriding
     # job_runner_kwargs()
     from_switches = self._kwargs_from_switches(_allowed_keys(runner_alias))
     from_job = self.job_runner_kwargs()
     return combine_dicts(from_switches, from_job)
Example #12
0
    def test_hadoop_2(self):
        """For version '2.0', the updated jobconf gains
        ``mapreduce.job.jar`` and the warnings mention the mismatch."""
        updated, warnings = self.updated_and_warnings(self.JOBCONF, '2.0')

        expected = combine_dicts(self.JOBCONF, {'mapreduce.job.jar': 'a.jar'})
        self.assertEqual(updated, expected)

        for fragment in ('do not match hadoop version',
                         'mapred.jar: mapreduce.job.jar'):
            self.assertIn(fragment, warnings)
Example #13
0
File: launch.py Project: Yelp/mrjob
 def _runner_kwargs(self):
     """Combine non-option kwargs, kwargs from switches (for every key
     in ``_RUNNER_OPTS``) and job kwargs, in that order."""
     # just use combine_dicts() and not combine_confs(); leave the
     # magic to the runner
     non_option = self._non_option_kwargs()
     # don't screen out irrelevant opts (see #1898)
     from_switches = self._kwargs_from_switches(set(_RUNNER_OPTS))
     from_job = self._job_kwargs()
     return combine_dicts(non_option, from_switches, from_job)
Example #14
0
 def _default_opts(self):
     """Return the parent runner's default opts combined with the
     Hadoop tmp dir and Spark deploy mode/master defaults."""
     inherited = super(HadoopJobRunner, self)._default_opts()
     hadoop_defaults = {
         'hadoop_tmp_dir': 'tmp/mrjob',
         'spark_deploy_mode': 'client',
         'spark_master': 'yarn',
     }
     return combine_dicts(inherited, hadoop_defaults)
Example #15
0
    def test_can_override_sort_values_from_job(self):
        """The job's own partitioner and JOBCONF are reflected in
        ``partitioner()`` and ``jobconf()``."""
        job = MRSortValuesAndMore()

        actual_partitioner = job.partitioner()
        self.assertEqual(
            actual_partitioner,
            'org.apache.hadoop.mapred.lib.HashPartitioner')

        actual_jobconf = job.jobconf()
        expected_jobconf = combine_dicts(
            _SORT_VALUES_JOBCONF, MRSortValuesAndMore.JOBCONF)
        self.assertEqual(actual_jobconf, expected_jobconf)
Example #16
0
    def _jobconf_for_step(self, step_num):
        """Return the jobconf for step *step_num*: the runner's *jobconf*
        option combined with the step's own ``jobconf`` (if any), passed
        through ``add_translated_jobconf_for_hadoop_version()`` for the
        current Hadoop version.
        """
        step_jobconf = self._get_step(step_num).get("jobconf")
        merged = combine_dicts(self._opts["jobconf"], step_jobconf)

        hadoop_version = self.get_hadoop_version()
        return add_translated_jobconf_for_hadoop_version(merged, hadoop_version)
Example #17
0
    def hadoop_job_runner_kwargs(self):
        """Keyword arguments used to create runners when
        :py:meth:`make_runner` is called, when we run a job on Hadoop
        (``-r hadoop``).

        :return: map from arg name to value

        Re-define this if you want finer control when running jobs on hadoop.
        """
        base_kwargs = self.job_runner_kwargs()
        hadoop_kwargs = self._get_kwargs_from_opt_group(self.hadoop_opt_group)
        return combine_dicts(base_kwargs, hadoop_kwargs)
Example #18
0
    def _hadoop_conf_args(self, step, step_num, num_steps):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.

        :param step: step description; its ``'jobconf'`` entry (if any) is
                     combined on top of the runner's *jobconf* option
        :param step_num: zero-based index of the step
        :param num_steps: total number of steps in the job
        :return: list of argument strings
        """
        assert 0 <= step_num < num_steps

        args = []

        jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

        # hadoop_extra_args
        args.extend(self._opts['hadoop_extra_args'])

        version = self.get_hadoop_version()

        # translate the jobconf configuration names to match
        # the hadoop version
        jobconf = add_translated_jobconf_for_hadoop_version(jobconf,
                                                            version)

        # new-style jobconf is passed with -D, old-style with -jobconf
        if uses_generic_jobconf(version):
            jobconf_switch = '-D'
        else:
            jobconf_switch = '-jobconf'

        # .items() (not .iteritems()) so this runs on Python 2 and 3
        for key, value in sorted(jobconf.items()):
            if value is not None:
                args.extend([jobconf_switch, '%s=%s' % (key, value)])

        # partitioner
        if self._partitioner:
            args.extend(['-partitioner', self._partitioner])

        # cmdenv
        for key, value in sorted(self._opts['cmdenv'].items()):
            args.extend(['-cmdenv', '%s=%s' % (key, value)])

        # hadoop_input_format (first step only)
        if step_num == 0 and self._hadoop_input_format:
            args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format (last step only)
        if step_num == num_steps - 1 and self._hadoop_output_format:
            args.extend(['-outputformat', self._hadoop_output_format])

        return args
Example #19
0
 def test_no_special_logic_for_paths(self):
     """Path-like keys get no special merging: the later dict's value
     replaces the earlier one."""
     # '\\w' spells the backslash explicitly; a bare '\w' is an invalid
     # escape sequence (same string value, but it triggers a warning)
     assert_equal(combine_dicts(
         {'PATH': '/bin:/usr/bin',
          'PYTHONPATH': '/usr/lib/python/site-packages',
          'PS1': '> '},
         {'PATH': '/home/dave/bin',
          'PYTHONPATH': '/home/dave/python',
          'CLASSPATH': '/home/dave/java',
          'PS1': '\\w> '}),
         {'PATH': '/home/dave/bin',
          'PYTHONPATH': '/home/dave/python',
          'CLASSPATH': '/home/dave/java',
          'PS1': '\\w> '})
Example #20
0
    def _jobconf_for_step(self, step_num):
        """Return the jobconf for step *step_num*: the runner's *jobconf*
        option combined with the step's own ``jobconf`` (if any).

        The combined dict is then handed to
        ``_update_jobconf_for_hadoop_version()`` together with the current
        Hadoop version before being returned.
        """
        step_jobconf = self._get_step(step_num).get("jobconf")
        merged = combine_dicts(self._opts["jobconf"], step_jobconf)

        # if user is using the wrong jobconfs, add in the correct ones
        hadoop_version = self.get_hadoop_version()
        self._update_jobconf_for_hadoop_version(merged, hadoop_version)

        return merged
Example #21
0
    def test_can_override_sort_values_from_cmd_line(self):
        """``--partitioner`` and ``--jobconf`` switches show up in
        ``partitioner()`` and ``jobconf()``."""
        switches = [
            '--partitioner', 'org.pants.FancyPantsPartitioner',
            '--jobconf', 'stream.num.map.output.key.fields=lots',
        ]
        job = MRSortValues(switches)

        self.assertEqual(job.partitioner(),
                         'org.pants.FancyPantsPartitioner')

        actual_jobconf = job.jobconf()
        expected_jobconf = combine_dicts(
            _SORT_VALUES_JOBCONF,
            {'stream.num.map.output.key.fields': 'lots'})
        self.assertEqual(actual_jobconf, expected_jobconf)
Example #22
0
 def _default_opts(self):
     """Return the parent runner's default opts combined with the
     Dataproc defaults listed below."""
     inherited = super(DataprocJobRunner, self)._default_opts()
     dataproc_defaults = {
         'bootstrap_python': True,
         'check_cluster_every': _DEFAULT_CHECK_CLUSTER_EVERY,
         'cleanup': ['CLUSTER', 'JOB', 'LOCAL_TMP'],
         'cloud_fs_sync_secs': _DEFAULT_CLOUD_FS_SYNC_SECS,
         'image_version': _DEFAULT_IMAGE_VERSION,
         'instance_type': _DEFAULT_INSTANCE_TYPE,
         'master_instance_type': _DEFAULT_INSTANCE_TYPE,
         'num_core_instances': _DATAPROC_MIN_WORKERS,
         'num_task_instances': 0,
     }
     return combine_dicts(inherited, dataproc_defaults)
 def emr_job_runner_kwargs(self):
     """Return the inherited EMR runner kwargs with AWS credentials
     added to ``cmdenv`` and extra commands added to ``bootstrap_cmds``.

     Raises ``KeyError`` if ``AWS_ACCESS_KEY_ID`` or
     ``AWS_SECRET_ACCESS_KEY`` is not set in the environment.
     """
     runner_kwargs = super(DownloadToS3, self).emr_job_runner_kwargs()

     # set up AWS credentials on EMR instances
     aws_env = {
         'AWS_ACCESS_KEY_ID': os.environ['AWS_ACCESS_KEY_ID'],
         'AWS_SECRET_ACCESS_KEY': os.environ['AWS_SECRET_ACCESS_KEY'],
     }
     runner_kwargs['cmdenv'] = combine_dicts(runner_kwargs['cmdenv'], aws_env)

     # install pip, aws-cli, and boto
     extra_bootstrap = [
         'sysctl -w "net.ipv4.tcp_window_scaling=0"',
         'sudo apt-get install python-pip',
         'sudo pip install awscli',
         'sudo pip install boto',
     ]
     runner_kwargs['bootstrap_cmds'] = combine_lists(
         runner_kwargs['bootstrap_cmds'], extra_bootstrap)

     return runner_kwargs
Example #24
0
 def _default_opts(self):
     """Return the parent runner's default opts combined with the
     cloud/ssh-tunnel defaults listed below."""
     inherited = super(HadoopInTheCloudJobRunner, self)._default_opts()
     cloud_defaults = {
         'cloud_part_size_mb': 100,  # 100 MB
         'max_mins_idle': _DEFAULT_MAX_MINS_IDLE,
         # don't use a list because it makes it hard to read option
         # values when running in verbose mode. See #1284
         'ssh_bind_ports': xrange(40001, 40841),
         'ssh_tunnel': False,
         'ssh_tunnel_is_open': False,
         # ssh_bin isn't included here. For example, the Dataproc
         # runner launches ssh through the gcloud util
     }
     return combine_dicts(inherited, cloud_defaults)
Example #25
0
    def default_options(self):
        """Return the parent store's default options combined with the
        runner defaults.

        ``owner`` is the current user's login name, or ``None`` if it
        can't be determined.
        """
        super_opts = super(RunnerOptionStore, self).default_options()

        # best-effort lookup; catch Exception (not a bare except) so that
        # KeyboardInterrupt and SystemExit still propagate
        try:
            owner = getpass.getuser()
        except Exception:
            owner = None

        return combine_dicts(super_opts, {
            'base_tmp_dir': tempfile.gettempdir(),
            'bootstrap_mrjob': True,
            'cleanup': ['ALL'],
            'cleanup_on_failure': ['NONE'],
            'hadoop_version': '0.20',
            'owner': owner,
        })
Example #26
0
    def job_runner_kwargs(self):
        """Keyword arguments used to create runners when
        :py:meth:`make_runner` is called.

        :return: map from arg name to value

        Re-define this if you want finer control of runner initialization.

        You might find :py:meth:`mrjob.conf.combine_dicts` useful if you
        want to add or change lots of keyword arguments.
        """
        non_option = self._non_option_kwargs()
        from_switches = self._kwargs_from_switches(_allowed_keys('base'))
        from_job = self._job_kwargs()
        return combine_dicts(non_option, from_switches, from_job)
Example #27
0
    def default_options(self):
        """Return the parent store's default options combined with the
        runner defaults.

        ``owner`` is the current user's login name, or ``None`` if it
        can't be determined.
        """
        super_opts = super(RunnerOptionStore, self).default_options()

        # best-effort lookup; catch Exception (not a bare except) so that
        # KeyboardInterrupt and SystemExit still propagate
        try:
            owner = getpass.getuser()
        except Exception:
            owner = None

        return combine_dicts(super_opts, {
            'check_input_paths': True,
            'cleanup': ['ALL'],
            'cleanup_on_failure': ['NONE'],
            'local_tmp_dir': tempfile.gettempdir(),
            'owner': owner,
            'sh_bin': ['sh', '-ex'],
            'strict_protocols': True,
        })
Example #28
0
 def test_no_special_logic_for_paths(self):
     """Path-like keys get no special merging: the later dict's value
     replaces the earlier one."""
     # "\\w" spells the backslash explicitly; a bare "\w" is an invalid
     # escape sequence (same string value, but it triggers a warning)
     self.assertEqual(
         combine_dicts(
             {"PATH": "/bin:/usr/bin", "PYTHONPATH": "/usr/lib/python/site-packages", "PS1": "> "},
             {
                 "PATH": "/home/dave/bin",
                 "PYTHONPATH": "/home/dave/python",
                 "CLASSPATH": "/home/dave/java",
                 "PS1": "\\w> ",
             },
         ),
         {
             "PATH": "/home/dave/bin",
             "PYTHONPATH": "/home/dave/python",
             "CLASSPATH": "/home/dave/java",
             "PS1": "\\w> ",
         },
     )
Example #29
0
    def jobconf(self):
        """``-D`` args to pass to hadoop streaming. This should be a map
        from property name to value.

        By default, this combines :option:`jobconf` options from the command
        line with :py:attr:`JOBCONF`, with command line arguments taking
        precedence. Entries whose value is ``None`` are dropped, and
        non-string values are JSON-encoded.

        We also blank out ``mapred.output.key.comparator.class``
        and ``mapred.text.key.comparator.options`` to prevent interference
        from :file:`mrjob.conf`.

        :py:attr:`SORT_VALUES` *can* be overridden by :py:attr:`JOBCONF`, the
        command line, and step-specific ``jobconf`` values.

        For example, if you know your values are numbers, and want to sort
        them in reverse, you could do::

            SORT_VALUES = True

            JOBCONF = {
              'mapred.output.key.comparator.class':
                  'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
              'mapred.text.key.comparator.options': '-k1 -k2nr',
            }

        If you want to re-define this, it's strongly recommended that you do
        something like this, so as not to inadvertently disable
        the :option:`jobconf` option::

            def jobconf(self):
                orig_jobconf = super(MyMRJobClass, self).jobconf()
                custom_jobconf = ...

                return mrjob.conf.combine_dicts(orig_jobconf, custom_jobconf)
        """
        # combine job and runner jobconf
        merged = combine_dicts(self.JOBCONF, self.options.jobconf)

        java_jobconf = {}
        for name, value in merged.items():
            if value is None:
                continue

            if isinstance(value, string_types):
                java_jobconf[name] = value
            else:
                # turn booleans into the Java equivalent
                # ("false", not "False")
                java_jobconf[name] = json.dumps(value)

        return java_jobconf
Example #30
0
    def default_options(self):
        """Return the parent store's default options combined with the
        runner defaults.

        ``owner`` is the current user's login name, or ``None`` if it
        can't be determined.
        """
        super_opts = super(RunnerOptionStore, self).default_options()

        # best-effort lookup; catch Exception (not a bare except) so that
        # KeyboardInterrupt and SystemExit still propagate
        try:
            owner = getpass.getuser()
        except Exception:
            owner = None

        return combine_dicts(
            super_opts,
            {
                "check_input_paths": True,
                "cleanup": ["ALL"],
                "cleanup_on_failure": ["NONE"],
                "local_tmp_dir": tempfile.gettempdir(),
                "owner": owner,
                "sh_bin": ["sh", "-ex"],
                "strict_protocols": True,
            },
        )
Example #31
0
 def _default_opts(self):
     """Return the parent runner's default opts combined with the
     default ``cloud_part_size_mb``."""
     inherited = super(SparkMRJobRunner, self)._default_opts()
     spark_defaults = {'cloud_part_size_mb': _DEFAULT_CLOUD_PART_SIZE_MB}
     return combine_dicts(inherited, spark_defaults)
Example #32
0
class DataprocRunnerOptionStore(RunnerOptionStore):
    """Option store for the Dataproc runner.

    Adds the Dataproc-specific option names, combiners and defaults on
    top of :py:class:`RunnerOptionStore`, and validates the worker count
    and instance types at construction time.
    """

    ALLOWED_KEYS = RunnerOptionStore.ALLOWED_KEYS.union({
        'gcp_project',

        'cluster_id',
        'region',
        'zone',
        'image_version',
        'check_cluster_every',

        'instance_type',
        'master_instance_type',
        'core_instance_type',
        'task_instance_type',

        'num_core_instances',
        'num_task_instances',

        'cloud_fs_sync_secs',
        'cloud_tmp_dir',

        'bootstrap',
        'bootstrap_python',
        'max_hours_idle',
    })

    COMBINERS = combine_dicts(
        RunnerOptionStore.COMBINERS,
        dict(
            bootstrap=combine_lists,
            cloud_tmp_dir=combine_paths,
        ),
    )

    # option name -> option whose value is used when the first is unset
    DEFAULT_FALLBACKS = dict(
        core_instance_type='instance_type',
        task_instance_type='instance_type',
    )

    def __init__(self, alias, opts, conf_paths):
        """Load options via the parent store, then validate them.

        :raises DataprocException: if fewer than the minimum number of
            core instances is requested, or if the core and task
            instance types differ
        """
        super(DataprocRunnerOptionStore, self).__init__(
            alias, opts, conf_paths)

        # Dataproc requires a master and >= 2 core instances
        # num_core_instances refers ONLY to number of CORE instances and does
        # NOT include the required 1 instance for master
        # In other words, minimum cluster size is 3 machines, 1 master and 2
        # "num_core_instances" workers
        too_few_workers = self['num_core_instances'] < _DATAPROC_MIN_WORKERS
        if too_few_workers:
            message = (
                'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)
            raise DataprocException(message)

        # fill in unset instance types from their fallback option
        for opt_name, fallback_name in self.DEFAULT_FALLBACKS.items():
            current = self[opt_name]
            self[opt_name] = current if current else self[fallback_name]

        types_match = (
            self['core_instance_type'] == self['task_instance_type'])
        if not types_match:
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

    def default_options(self):
        """Return the parent store's default options combined with the
        Dataproc defaults."""
        inherited = super(DataprocRunnerOptionStore, self).default_options()

        dataproc_defaults = {
            'bootstrap_python': True,
            'image_version': _DEFAULT_IMAGE_VERSION,
            'check_cluster_every': _DEFAULT_CHECK_CLUSTER_EVERY,

            'instance_type': _DEFAULT_INSTANCE_TYPE,
            'master_instance_type': _DEFAULT_INSTANCE_TYPE,

            'num_core_instances': _DATAPROC_MIN_WORKERS,
            'num_task_instances': 0,

            'cloud_fs_sync_secs': _DEFAULT_CLOUD_FS_SYNC_SECS,

            'max_hours_idle': _DEFAULT_MAX_HOURS_IDLE,
            'sh_bin': ['/bin/sh', '-ex'],

            'cleanup': ['CLUSTER', 'JOB', 'LOCAL_TMP'],
        }
        return combine_dicts(inherited, dataproc_defaults)
Example #33
0
 def _default_opts(self):
     """Return the parent runner's default opts combined with
     ``sh_bin=['sh', '-ex']``."""
     inherited = super(MRJobBinRunner, self)._default_opts()
     bin_defaults = {'sh_bin': ['sh', '-ex']}
     return combine_dicts(inherited, bin_defaults)
Example #34
0
 def test_later_values_take_precedence(self):
     """A key present in both dicts takes the later dict's value."""
     earlier = {'TMPDIR': '/tmp', 'HOME': '/home/dave'}
     later = {'TMPDIR': '/var/tmp'}
     expected = {'TMPDIR': '/var/tmp', 'HOME': '/home/dave'}
     self.assertEqual(combine_dicts(earlier, later), expected)
Example #35
0
class SimRunnerOptionStore(RunnerOptionStore):
    """Runner option store whose *cmdenv* option is combined with
    ``combine_local_envs``."""

    COMBINERS = combine_dicts(
        RunnerOptionStore.COMBINERS,
        dict(cmdenv=combine_local_envs),
    )
Example #36
0
 def test_None_value(self):
     """A later ``None`` value replaces the earlier value and is kept."""
     earlier = {'USER': '******', 'TERM': 'xterm'}
     later = {'USER': None}
     expected = {'TERM': 'xterm', 'USER': None}
     self.assertEqual(combine_dicts(earlier, later), expected)
Example #37
0
 def _runner_kwargs(self):
     """Combine non-option kwargs, kwargs from switches (for the names
     from ``_runner_opt_names()``) and job kwargs, in that order."""
     non_option = self._non_option_kwargs()
     from_switches = self._kwargs_from_switches(self._runner_opt_names())
     from_job = self._job_kwargs()
     return combine_dicts(non_option, from_switches, from_job)
Example #38
0
 def test_cleared_value(self):
     """Check the combined result when the later dict holds a
     ``ClearedValue`` wrapping a string."""
     earlier = {'USER': '******', 'TERM': 'xterm'}
     later = {'USER': ClearedValue('caleb')}
     expected = {'TERM': 'xterm', 'USER': '******'}
     self.assertEqual(combine_dicts(earlier, later), expected)
Example #39
0
 def test_skip_None(self):
     """``None`` arguments are skipped when combining."""
     combined = combine_dicts(
         None, {'USER': '******'}, None, {'TERM': 'xterm'}, None)
     self.assertEqual(combined, {'USER': '******', 'TERM': 'xterm'})
Example #40
0
class RunnerOptionStore(OptionStore):
    """Option store shared by runners: declares the allowed option names,
    how each option's values are combined, and the defaults."""

    # Test cases for this class live in tests.test_option_store rather than
    # tests.test_runner.

    ALLOWED_KEYS = OptionStore.ALLOWED_KEYS.union(
        set([
            'base_tmp_dir',
            'bootstrap_mrjob',
            'cleanup',
            'cleanup_on_failure',
            'cmdenv',
            'hadoop_extra_args',
            'hadoop_streaming_jar',
            'hadoop_version',
            'interpreter',
            'jobconf',
            'label',
            'owner',
            'python_archives',
            'python_bin',
            'setup',
            'setup_cmds',
            'setup_scripts',
            'sh_bin',
            'steps_interpreter',
            'steps_python_bin',
            'upload_archives',
            'upload_files',
        ]))

    COMBINERS = combine_dicts(
        OptionStore.COMBINERS, {
            'base_tmp_dir': combine_paths,
            'cmdenv': combine_envs,
            'hadoop_extra_args': combine_lists,
            'interpreter': combine_cmds,
            'jobconf': combine_dicts,
            'python_archives': combine_path_lists,
            'python_bin': combine_cmds,
            'setup': combine_lists,
            'setup_cmds': combine_lists,
            'setup_scripts': combine_path_lists,
            'sh_bin': combine_cmds,
            'steps_interpreter': combine_cmds,
            'steps_python_bin': combine_cmds,
            'upload_archives': combine_path_lists,
            'upload_files': combine_path_lists,
        })

    def __init__(self, alias, opts, conf_paths):
        """
        :param alias: Runner alias (e.g. ``'local'``)
        :param opts: Options from the command line
        :param conf_paths: Either a file path or an iterable of paths to config
                           files
        """
        super(RunnerOptionStore, self).__init__()

        # sanitize incoming options and issue warnings for bad keys
        opts = self.validated_options(opts,
                                      'Got unexpected keyword arguments: %s')

        unsanitized_opt_dicts = load_opts_from_mrjob_confs(
            alias, conf_paths=conf_paths)

        # options from each config file are validated and added to the
        # cascade before the command-line options
        for path, mrjob_conf_opts in unsanitized_opt_dicts:
            self.cascading_dicts.append(
                self.validated_options(
                    mrjob_conf_opts,
                    'Got unexpected opts from %s: %%s' % path))

        self.cascading_dicts.append(opts)

        if (len(self.cascading_dicts) > 2
                and all(len(d) == 0 for d in self.cascading_dicts[2:-1])
                and (len(conf_paths or []) > 0 or len(opts) == 0)):
            log.warning('No configs specified for %s runner' % alias)

        self.populate_values_from_cascading_dicts()

        self._validate_cleanup()

        self._fix_interp_options()

        log.debug('Active configuration:')
        log.debug(pprint.pformat(self))

    def default_options(self):
        """Return the parent store's default options combined with the
        runner defaults.

        ``owner`` is the current user's login name, or ``None`` if it
        can't be determined.
        """
        super_opts = super(RunnerOptionStore, self).default_options()

        # best-effort lookup; catch Exception (not a bare except) so that
        # KeyboardInterrupt and SystemExit still propagate
        try:
            owner = getpass.getuser()
        except Exception:
            owner = None

        return combine_dicts(
            super_opts, {
                'base_tmp_dir': tempfile.gettempdir(),
                'bootstrap_mrjob': True,
                'cleanup': ['ALL'],
                'cleanup_on_failure': ['NONE'],
                'hadoop_version': '0.20',
                'owner': owner,
                'sh_bin': ['sh'],
            })

    def _validate_cleanup(self):
        """Normalize *cleanup* and *cleanup_on_failure* to lists and check
        every entry against ``CLEANUP_CHOICES``.

        :raises ValueError: on an unknown choice, or when ``'NONE'`` is
            combined with any other choice
        """
        # old API accepts strings for cleanup
        # new API wants lists
        for opt_key in ('cleanup', 'cleanup_on_failure'):
            if isinstance(self[opt_key], basestring):
                self[opt_key] = [self[opt_key]]

        def validate_cleanup(error_str, opt_list):
            for choice in opt_list:
                if choice not in CLEANUP_CHOICES:
                    raise ValueError(error_str % choice)
            if 'NONE' in opt_list and len(set(opt_list)) > 1:
                raise ValueError('Cannot clean up both nothing and something!')

        cleanup_error = ('cleanup must be one of %s, not %%s' %
                         ', '.join(CLEANUP_CHOICES))
        validate_cleanup(cleanup_error, self['cleanup'])

        cleanup_failure_error = (
            'cleanup_on_failure must be one of %s, not %%s' %
            ', '.join(CLEANUP_CHOICES))
        validate_cleanup(cleanup_failure_error, self['cleanup_on_failure'])

    def _fix_interp_options(self):
        """Fill in unset interpreter options from one another.

        * *steps_python_bin* falls back to *python_bin*, then to the
          current interpreter, then to ``'python'``
        * *python_bin* falls back to ``['python']``
        * *steps_interpreter* falls back to *interpreter*, then to
          *steps_python_bin*
        * *interpreter* falls back to *python_bin*
        """
        if not self['steps_python_bin']:
            # the fallback to 'python' must sit inside the list: a
            # one-element list is always truthy, so
            # ``[sys.executable] or ['python']`` could never pick 'python'
            self['steps_python_bin'] = (
                self['python_bin'] or [sys.executable or 'python'])

        if not self['python_bin']:
            self['python_bin'] = ['python']

        if not self['steps_interpreter']:
            if self['interpreter']:
                self['steps_interpreter'] = self['interpreter']
            else:
                self['steps_interpreter'] = self['steps_python_bin']

        if not self['interpreter']:
            self['interpreter'] = self['python_bin']
Example #41
0
 def _default_opts(cls):
     """A dictionary giving the default value of options."""
     inherited = super(LocalMRJobRunner, cls)._default_opts()
     # prefer whatever interpreter we're currently using
     current_python = sys.executable or 'python'
     return combine_dicts(inherited, dict(python_bin=[current_python]))
Example #42
0
 def default_options(self):
     """Return the parent store's default options combined with
     ``hadoop_tmp_dir='tmp/mrjob'``."""
     inherited = super(HadoopRunnerOptionStore, self).default_options()
     return combine_dicts(inherited, dict(hadoop_tmp_dir='tmp/mrjob'))
Example #43
0
    def jobconf(self):
        """``-D`` args to pass to hadoop streaming. This should be a map
        from property name to value.

        By default, this combines :option:`jobconf` options from the command
        line with :py:attr:`JOBCONF`, with command line arguments taking
        precedence. Boolean values are converted to ``'true'``/``'false'``.

        We also blank out ``mapred.output.key.comparator.class``
        and ``mapred.text.key.comparator.options`` to prevent interference
        from :file:`mrjob.conf`.

        :py:attr:`SORT_VALUES` *can* be overridden by :py:attr:`JOBCONF`, the
        command line, and step-specific ``jobconf`` values.

        For example, if you know your values are numbers, and want to sort
        them in reverse, you could do::

            SORT_VALUES = True

            JOBCONF = {
              'mapred.output.key.comparator.class':
                  'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
              'mapred.text.key.comparator.options': '-k1 -k2nr',
            }

        If you want to re-define this, it's strongly recommended that you do
        something like this, so as not to inadvertently disable
        the :option:`jobconf` option::

            def jobconf(self):
                orig_jobconf = super(MyMRJobClass, self).jobconf()
                custom_jobconf = ...

                return mrjob.conf.combine_dicts(orig_jobconf, custom_jobconf)
        """

        # deal with various forms of bad behavior by users
        merged = combine_dicts(self.JOBCONF, self.options.jobconf)
        cleaned = {}

        for name, raw in merged.items():
            if isinstance(raw, bool):
                # boolean values need to be lowercased
                cleaned[name] = 'true' if raw else 'false'

            # TODO: why would a jobconf variable be named 'hadoop_version'?
            # hadoop_version should be a string
            elif name == 'hadoop_version' and isinstance(raw, float):
                log.warning('hadoop_version should be a string, not %s' %
                            raw)
                # one decimal place from 1.0 up (e.g. 1.0), two below
                # (e.g. 0.20)
                version_format = '%.1f' if raw >= 1.0 else '%.2f'
                cleaned[name] = version_format % raw

            else:
                cleaned[name] = raw

        return cleaned
Example #44
0
 def test_deleted_value(self):
     """A later ``ClearedValue(None)`` removes the key from the result."""
     earlier = {'USER': '******', 'TERM': 'xterm'}
     later = {'USER': ClearedValue(None)}
     self.assertEqual(combine_dicts(earlier, later), {'TERM': 'xterm'})
Example #45
0
 def _opts_combiners(cls):
     """Return the parent's option combiners, with *cmdenv* combined by
     ``combine_local_envs``."""
     inherited = super(LocalMRJobRunner, cls)._opts_combiners()
     # on windows, PYTHONPATH should use ;, not :
     local_combiners = dict(cmdenv=combine_local_envs)
     return combine_dicts(inherited, local_combiners)
Example #46
0
 def _opt_combiners(self):
     """Combine *cmdenv* with :py:func:`~mrjob.conf.combine_local_envs`"""
     inherited = super(SimMRJobRunner, self)._opt_combiners()
     sim_combiners = {'cmdenv': combine_local_envs}
     return combine_dicts(inherited, sim_combiners)
Example #47
0
class RunnerOptionStore(OptionStore):
    """Option store shared by runners: declares the allowed option names,
    how each option's values are combined, deprecated aliases, and the
    defaults."""

    # Test cases for this class live in tests.test_option_store rather than
    # tests.test_runner.

    ALLOWED_KEYS = OptionStore.ALLOWED_KEYS.union(
        set([
            'bootstrap_mrjob',
            'check_input_paths',
            'cleanup',
            'cleanup_on_failure',
            'cmdenv',
            'hadoop_version',
            'interpreter',
            'jobconf',
            'label',
            'local_tmp_dir',
            'owner',
            'python_archives',
            'python_bin',
            'setup',
            'setup_cmds',
            'setup_scripts',
            'sh_bin',
            'steps_interpreter',
            'steps_python_bin',
            'strict_protocols',
            'upload_archives',
            'upload_files',
        ]))

    COMBINERS = combine_dicts(
        OptionStore.COMBINERS, {
            'cmdenv': combine_envs,
            'interpreter': combine_cmds,
            'jobconf': combine_dicts,
            'local_tmp_dir': combine_paths,
            'python_archives': combine_path_lists,
            'python_bin': combine_cmds,
            'setup': combine_lists,
            'setup_cmds': combine_lists,
            'setup_scripts': combine_path_lists,
            'sh_bin': combine_cmds,
            'steps_interpreter': combine_cmds,
            'steps_python_bin': combine_cmds,
            'upload_archives': combine_path_lists,
            'upload_files': combine_path_lists,
        })

    # deprecated option name -> current option name
    DEPRECATED_ALIASES = {
        'base_tmp_dir': 'local_tmp_dir',
    }

    def __init__(self, alias, opts, conf_paths):
        """
        :param alias: Runner alias (e.g. ``'local'``)
        :param opts: Keyword args to runner's constructor (usually from the
                     command line).
        :param conf_paths: An iterable of paths to config files
        """
        super(RunnerOptionStore, self).__init__()

        # sanitize incoming options and issue warnings for bad keys
        opts = self.validated_options(opts)

        unsanitized_opt_dicts = load_opts_from_mrjob_confs(
            alias, conf_paths=conf_paths)

        # options from each config file are validated and added to the
        # cascade before the constructor's own options
        for path, mrjob_conf_opts in unsanitized_opt_dicts:
            self.cascading_dicts.append(
                self.validated_options(mrjob_conf_opts,
                                       from_where=(' from %s' % path)))

        self.cascading_dicts.append(opts)

        if (len(self.cascading_dicts) > 2
                and all(len(d) == 0 for d in self.cascading_dicts[2:-1])
                and (len(conf_paths or []) > 0)):
            log.warning('No configs specified for %s runner' % alias)

        self.populate_values_from_cascading_dicts()

        log.debug('Active configuration:')
        log.debug(pprint.pformat(self))

    def default_options(self):
        """Return the parent store's default options combined with the
        runner defaults.

        ``owner`` is the current user's login name, or ``None`` if it
        can't be determined.
        """
        super_opts = super(RunnerOptionStore, self).default_options()

        # best-effort lookup; catch Exception (not a bare except) so that
        # KeyboardInterrupt and SystemExit still propagate
        try:
            owner = getpass.getuser()
        except Exception:
            owner = None

        return combine_dicts(
            super_opts, {
                'check_input_paths': True,
                'cleanup': ['ALL'],
                'cleanup_on_failure': ['NONE'],
                'local_tmp_dir': tempfile.gettempdir(),
                'owner': owner,
                'sh_bin': ['sh', '-ex'],
                'strict_protocols': True,
            })

    def validated_options(self, opts, from_where=''):
        """Run the parent's validation, then normalize the *cleanup* and
        *cleanup_on_failure* options in the result.

        :param opts: option dict to validate
        :param from_where: suffix for warning messages saying where the
                           options came from (e.g. ``' from <path>'``)
        :return: the validated option dict
        """
        opts = super(RunnerOptionStore,
                     self).validated_options(opts, from_where)

        self._fix_cleanup_opt('cleanup', opts, from_where)
        self._fix_cleanup_opt('cleanup_on_failure', opts, from_where)

        return opts

    def _fix_cleanup_opt(self, opt_key, opts, from_where=''):
        """Normalize ``opts[opt_key]`` in place to a list of valid cleanup
        choices, translating deprecated aliases (with a warning).

        Does nothing if the option is missing or ``None``.

        :raises ValueError: on an unknown choice, or when ``'NONE'`` is
            combined with any other choice
        """
        if opts.get(opt_key) is None:
            return

        opt_list = opts[opt_key]

        # runner expects list of string, not string
        if isinstance(opt_list, string_types):
            opt_list = [opt_list]

        if 'NONE' in opt_list and len(set(opt_list)) > 1:
            raise ValueError('Cannot clean up both nothing and something!')

        def handle_cleanup_opt(opt):
            if opt in CLEANUP_CHOICES:
                return opt

            if opt in _CLEANUP_DEPRECATED_ALIASES:
                aliased_opt = _CLEANUP_DEPRECATED_ALIASES[opt]
                # TODO: don't do this when option value is None
                log.warning(
                    'Deprecated %s option %s%s has been renamed to %s' %
                    (opt_key, opt, from_where, aliased_opt))
                return aliased_opt

            raise ValueError('%s must be one of %s, not %s' %
                             (opt_key, ', '.join(CLEANUP_CHOICES), opt))

        opt_list = [handle_cleanup_opt(opt) for opt in opt_list]

        opts[opt_key] = opt_list
Example #48
0
 def test_empty(self):
     """combine_dicts() with no arguments yields an empty dict."""
     combined = combine_dicts()
     assert_equal(combined, {})
Example #49
0
 def test_empty(self):
     """combine_dicts() with no arguments yields an empty dict."""
     combined = combine_dicts()
     self.assertEqual(combined, {})
Example #50
0
 def default_options(self):
     """Return the parent store's default options combined with
     ``bootstrap_mrjob=False``."""
     inherited = super(SimRunnerOptionStore, self).default_options()
     # don't bootstrap mrjob by default when running locally
     return combine_dicts(inherited, dict(bootstrap_mrjob=False))
Example #51
0
 def default_options(self):
     """Return the parent store's default options combined with a
     ``python_bin`` that points at the running interpreter."""
     inherited = super(LocalRunnerOptionStore, self).default_options()
     # prefer whatever interpreter we're currently using
     current_python = sys.executable or 'python'
     return combine_dicts(inherited, dict(python_bin=[current_python]))
Example #52
0
 def _default_opts(self):
     """Return the parent runner's default opts combined with
     ``read_logs=True``."""
     inherited = super(MRJobBinRunner, self)._default_opts()
     bin_defaults = {'read_logs': True}
     return combine_dicts(inherited, bin_defaults)