def _job_runner_kwargs_for_runner(self, runner_alias):
    """Helper method that powers the *_job_runner_kwargs() methods."""
    # user can no longer silently ignore switches by overriding
    # job_runner_kwargs()
    return combine_dicts(
        self._kwargs_from_switches(_allowed_keys(runner_alias)),
        self.job_runner_kwargs(),
    )
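# For context, a minimal sketch of the kind of runner-specific wrapper
# this helper powers; the method name and alias here are illustrative
# assumptions, not a guaranteed part of the public API.
def hadoop_job_runner_kwargs(self):
    """Keyword arguments to create a Hadoop runner (sketch)."""
    return self._job_runner_kwargs_for_runner('hadoop')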
class HadoopRunnerOptionStore(RunnerOptionStore):

    ALLOWED_KEYS = _allowed_keys('hadoop')
    COMBINERS = _combiners('hadoop')
    DEPRECATED_ALIASES = _deprecated_aliases('hadoop')

    def default_options(self):
        super_opts = super(HadoopRunnerOptionStore, self).default_options()
        return combine_dicts(super_opts, {
            'hadoop_tmp_dir': 'tmp/mrjob',
        })
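# A quick, standalone illustration of the combine_dicts() pattern used in
# default_options() above: values from dicts later in the argument list
# take precedence. The option values shown are assumptions for
# demonstration only.
from mrjob.conf import combine_dicts

_defaults = {'local_tmp_dir': '/tmp', 'hadoop_tmp_dir': None}
_overrides = {'hadoop_tmp_dir': 'tmp/mrjob'}

assert combine_dicts(_defaults, _overrides) == {
    'local_tmp_dir': '/tmp', 'hadoop_tmp_dir': 'tmp/mrjob'}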
class DataprocRunnerOptionStore(RunnerOptionStore):

    ALLOWED_KEYS = _allowed_keys('dataproc')
    COMBINERS = _combiners('dataproc')
    DEPRECATED_ALIASES = _deprecated_aliases('dataproc')

    DEFAULT_FALLBACKS = {
        'core_instance_type': 'instance_type',
        'task_instance_type': 'instance_type',
    }

    def __init__(self, alias, opts, conf_paths):
        super(DataprocRunnerOptionStore, self).__init__(
            alias, opts, conf_paths)

        # Dataproc requires a master and >= 2 core instances.
        # num_core_instances refers ONLY to the number of CORE instances
        # and does NOT include the 1 required master instance.
        # In other words, the minimum cluster size is 3 machines: 1 master
        # and 2 "num_core_instances" workers.
        if self['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' %
                _DATAPROC_MIN_WORKERS)

        for varname, fallback_varname in self.DEFAULT_FALLBACKS.items():
            self[varname] = self[varname] or self[fallback_varname]

        if self['core_instance_type'] != self['task_instance_type']:
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be '
                'identical')

    def default_options(self):
        super_opts = super(DataprocRunnerOptionStore, self).default_options()
        return combine_dicts(super_opts, {
            'bootstrap_python': True,
            'image_version': _DEFAULT_IMAGE_VERSION,
            'check_cluster_every': _DEFAULT_CHECK_CLUSTER_EVERY,
            'instance_type': _DEFAULT_INSTANCE_TYPE,
            'master_instance_type': _DEFAULT_INSTANCE_TYPE,
            'num_core_instances': _DATAPROC_MIN_WORKERS,
            'num_task_instances': 0,
            'cloud_fs_sync_secs': _DEFAULT_CLOUD_FS_SYNC_SECS,
            'max_hours_idle': _DEFAULT_MAX_HOURS_IDLE,
            'sh_bin': ['/bin/sh', '-ex'],
            'cleanup': ['CLUSTER', 'JOB', 'LOCAL_TMP'],
        })
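# A minimal, standalone sketch of the DEFAULT_FALLBACKS resolution in
# __init__ above: unset (falsy) instance-type options fall back to the
# generic 'instance_type'. The machine type shown is an assumed example
# value, not a default.
_fallbacks = {
    'core_instance_type': 'instance_type',
    'task_instance_type': 'instance_type',
}

_opts = {
    'instance_type': 'n1-standard-2',  # assumed example machine type
    'core_instance_type': None,
    'task_instance_type': None,
}

for _varname, _fallback_varname in _fallbacks.items():
    _opts[_varname] = _opts[_varname] or _opts[_fallback_varname]

# both types now match, satisfying the Dataproc v1 identical-types check
assert _opts['core_instance_type'] == 'n1-standard-2'
assert _opts['task_instance_type'] == 'n1-standard-2'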
def job_runner_kwargs(self):
    """Keyword arguments used to create runners when
    :py:meth:`make_runner` is called.

    :return: map from arg name to value

    Re-define this if you want finer control of runner initialization.

    You might find :py:func:`mrjob.conf.combine_dicts` useful if you
    want to add or change lots of keyword arguments.
    """
    return combine_dicts(
        self._non_option_kwargs(),
        self._kwargs_from_switches(_allowed_keys('base')),
        self._job_kwargs(),
    )
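# A minimal sketch of re-defining job_runner_kwargs() in a job class, as
# the docstring above suggests; MRWordCount and the extra 'label' kwarg
# are assumptions for illustration.
from mrjob.conf import combine_dicts
from mrjob.job import MRJob


class MRWordCount(MRJob):

    def job_runner_kwargs(self):
        # start from the default kwargs, then override or add a few
        return combine_dicts(
            super(MRWordCount, self).job_runner_kwargs(),
            {'label': 'word_count'},  # assumed kwarg, for illustration
        )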
class SimRunnerOptionStore(RunnerOptionStore):

    # these are the same for 'local' and 'inline' runners
    ALLOWED_KEYS = _allowed_keys('local')
    COMBINERS = _combiners('local')
    DEPRECATED_ALIASES = _deprecated_aliases('local')
class RunnerOptionStore(OptionStore):

    # 'base' is arbitrary; if an option supports all runners, it won't
    # have "runners" set in _RUNNER_OPTS at all
    ALLOWED_KEYS = _allowed_keys('base')
    COMBINERS = _combiners('base')
    DEPRECATED_ALIASES = _deprecated_aliases('base')

    def __init__(self, alias, opts, conf_paths):
        """
        :param alias: Runner alias (e.g. ``'local'``)
        :param opts: Keyword args to runner's constructor (usually from
                     the command line).
        :param conf_paths: An iterable of paths to config files
        """
        super(RunnerOptionStore, self).__init__()

        # sanitize incoming options and issue warnings for bad keys
        opts = self.validated_options(opts)

        unsanitized_opt_dicts = load_opts_from_mrjob_confs(
            alias, conf_paths=conf_paths)

        for path, mrjob_conf_opts in unsanitized_opt_dicts:
            self.cascading_dicts.append(
                self.validated_options(
                    mrjob_conf_opts, from_where=(' from %s' % path)))

        self.cascading_dicts.append(opts)

        if (len(self.cascading_dicts) > 2 and
                all(len(d) == 0 for d in self.cascading_dicts[2:-1]) and
                len(conf_paths or []) > 0):
            log.warning('No configs specified for %s runner' % alias)

        self.populate_values_from_cascading_dicts()

        log.debug('Active configuration:')
        log.debug(pprint.pformat(dict(
            (opt_key, self._obfuscate(opt_key, opt_value))
            for opt_key, opt_value in self.items())))

    def default_options(self):
        super_opts = super(RunnerOptionStore, self).default_options()

        try:
            owner = getpass.getuser()
        except Exception:  # getpass can fail if the user can't be determined
            owner = None

        return combine_dicts(super_opts, {
            'check_input_paths': True,
            'cleanup': ['ALL'],
            'cleanup_on_failure': ['NONE'],
            'local_tmp_dir': tempfile.gettempdir(),
            'owner': owner,
            'sh_bin': ['sh', '-ex'],
            'strict_protocols': True,
        })

    def validated_options(self, opts, from_where=''):
        opts = super(RunnerOptionStore, self).validated_options(
            opts, from_where)

        self._fix_cleanup_opt('cleanup', opts, from_where)
        self._fix_cleanup_opt('cleanup_on_failure', opts, from_where)

        return opts

    def _fix_cleanup_opt(self, opt_key, opts, from_where=''):
        if opts.get(opt_key) is None:
            return

        opt_list = opts[opt_key]

        # runner expects a list of strings, not a single string
        if isinstance(opt_list, string_types):
            opt_list = [opt_list]

        if 'NONE' in opt_list and len(set(opt_list)) > 1:
            raise ValueError(
                'Cannot clean up both nothing and something!')

        def handle_cleanup_opt(opt):
            if opt in CLEANUP_CHOICES:
                return opt

            if opt in _CLEANUP_DEPRECATED_ALIASES:
                aliased_opt = _CLEANUP_DEPRECATED_ALIASES[opt]
                # TODO: don't do this when option value is None
                log.warning(
                    'Deprecated %s option %s%s has been renamed to %s' % (
                        opt_key, opt, from_where, aliased_opt))
                return aliased_opt

            raise ValueError('%s must be one of %s, not %s' % (
                opt_key, ', '.join(CLEANUP_CHOICES), opt))

        opt_list = [handle_cleanup_opt(opt) for opt in opt_list]

        opts[opt_key] = opt_list

    def _obfuscate(self, opt_key, opt_value):
        """Return the value of an opt to show in the debug printout.

        Used to obfuscate credentials, etc."""
        return opt_value
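# A minimal sketch of overriding _obfuscate() in a subclass so that
# credentials don't appear in the debug printout of the active
# configuration; the class and option names below are assumptions for
# illustration.
class CloudRunnerOptionStore(RunnerOptionStore):

    _SENSITIVE_KEYS = {'aws_secret_access_key'}  # assumed option name

    def _obfuscate(self, opt_key, opt_value):
        if opt_key in self._SENSITIVE_KEYS and opt_value is not None:
            return '...'
        return opt_value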