def _default_opts(self):
    return combine_dicts(
        super(HadoopJobRunner, self)._default_opts(),
        dict(
            hadoop_tmp_dir='tmp/mrjob',
        )
    )
def _default_opts(self):
    return combine_dicts(
        super(MRJobBinRunner, self)._default_opts(),
        dict(
            read_logs=True,
        )
    )
def _add_runner_args_for_opt(parser, opt_name, include_deprecated=True):
    """Add switches for a single option (*opt_name*) to the given parser."""
    conf = _RUNNER_OPTS[opt_name]

    if conf.get('deprecated') and not include_deprecated:
        return

    switches = conf.get('switches') or []

    for args, kwargs in switches:
        kwargs = dict(kwargs)

        deprecated_aliases = kwargs.pop('deprecated_aliases', None)

        kwargs['dest'] = opt_name

        if kwargs.get('action') == 'append':
            kwargs['default'] = []
        else:
            kwargs['default'] = None

        parser.add_argument(*args, **kwargs)

        # add a switch for deprecated aliases
        if deprecated_aliases and include_deprecated:
            help = 'Deprecated alias%s for %s' % (
                ('es' if len(deprecated_aliases) > 1 else ''),
                args[-1])
            parser.add_argument(
                *deprecated_aliases,
                **combine_dicts(kwargs, dict(help=help)))
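# A minimal sketch of how a switch definition might flow through the helper
# above. The _RUNNER_OPTS entry here is hypothetical (the real table lives in
# mrjob's options module); it assumes the function above and combine_dicts
# are available in the same module.

import argparse

_RUNNER_OPTS = {
    'cmdenv': {
        'switches': [
            (['--cmdenv'], dict(
                action='append',
                help='Set an environment variable for the job',
                deprecated_aliases=['--cmd-env'],  # hypothetical alias
            )),
        ],
    },
}

parser = argparse.ArgumentParser()
_add_runner_args_for_opt(parser, 'cmdenv')

# both the main switch and its deprecated alias append to args.cmdenv
args = parser.parse_args(['--cmdenv', 'TZ=UTC'])
print(args.cmdenv)  # ['TZ=UTC'] -- 'append' actions default to []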
def _default_opts(self):
    return combine_dicts(
        super(MRJobBinRunner, self)._default_opts(),
        dict(
            sh_bin=['sh', '-ex'],
        )
    )
def default_options(self):
    super_opts = super(HadoopRunnerOptionStore, self).default_options()
    return combine_dicts(super_opts, {
        'hadoop_home': os.environ.get('HADOOP_HOME'),
        'hdfs_scratch_dir': 'tmp/mrjob',
        'check_input_paths': True,
    })
def jobconf(self):
    orig_jobconf = super(ZipNumClusterJob, self).jobconf()
    custom_jobconf = {
        'mapreduce.job.reduces': self.options.shards,
        'mapreduce.totalorderpartitioner.path': self.options.splitfile,
    }

    combined = combine_dicts(orig_jobconf, custom_jobconf)
    return combined
def _default_opts(self):
    return combine_dicts(
        super(SparkMRJobRunner, self)._default_opts(),
        dict(
            cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
        ),
    )
@classmethod
def _opts_combiners(cls):
    """Map from option name to a combine_*() function used to combine
    values for that option. This allows us to specify that some options
    are lists, or contain environment variables, or whatever."""
    return combine_dicts(
        super(HadoopJobRunner, cls)._opts_combiners(),
        {"hadoop_bin": combine_paths,
         "hadoop_home": combine_paths,
         "hdfs_scratch_dir": combine_paths},
    )
def test_hadoop_1(self):
    updated, warnings = self.updated_and_warnings(
        self.JOBCONF, '1.0')

    self.assertEqual(updated,
                     combine_dicts(self.JOBCONF, {'user.name': 'dave'}))
    self.assertIn('do not match hadoop version', warnings)
    self.assertIn('mapreduce.job.user.name: user.name', warnings)
def paginate(self, **kwargs):
    result = self.method(**kwargs)
    values = result[self.result_key]

    for page_start in range(0, len(values), self.page_size):
        page = values[page_start:page_start + self.page_size]
        yield combine_dicts(result, {self.result_key: page})
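# A self-contained sketch of the pagination pattern above: slice the list
# under result_key into fixed-size pages while preserving the other keys of
# the original response. describe_items and its payload are hypothetical
# stand-ins for whatever API method is being wrapped.

from mrjob.conf import combine_dicts

def describe_items(**kwargs):
    # hypothetical API call: one big response, list lives under 'Items'
    return {'Items': list(range(5)), 'Marker': 'abc'}

result = describe_items()
page_size = 2

for page_start in range(0, len(result['Items']), page_size):
    page = result['Items'][page_start:page_start + page_size]
    print(combine_dicts(result, {'Items': page}))
# pages of Items: [0, 1], then [2, 3], then [4];
# 'Marker': 'abc' is carried along on every page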
def _job_runner_kwargs_for_runner(self, runner_alias):
    """Helper method that powers the *_job_runner_kwargs() methods."""
    # user can no longer silently ignore switches by overriding
    # job_runner_kwargs()
    return combine_dicts(
        self._kwargs_from_switches(_allowed_keys(runner_alias)),
        self.job_runner_kwargs(),
    )
def test_hadoop_2(self):
    updated, warnings = self.updated_and_warnings(
        self.JOBCONF, '2.0')

    self.assertEqual(updated,
                     combine_dicts(self.JOBCONF,
                                   {'mapreduce.job.jar': 'a.jar'}))
    self.assertIn('do not match hadoop version', warnings)
    self.assertIn('mapred.jar: mapreduce.job.jar', warnings)
def _runner_kwargs(self):
    # just use combine_dicts() and not combine_confs(); leave the
    # magic to the runner
    return combine_dicts(
        self._non_option_kwargs(),
        # don't screen out irrelevant opts (see #1898)
        self._kwargs_from_switches(set(_RUNNER_OPTS)),
        self._job_kwargs(),
    )
def _default_opts(self):
    return combine_dicts(
        super(HadoopJobRunner, self)._default_opts(),
        dict(
            hadoop_tmp_dir='tmp/mrjob',
            spark_deploy_mode='client',
            spark_master='yarn',
        )
    )
def test_can_override_sort_values_from_job(self):
    mr_job = MRSortValuesAndMore()

    self.assertEqual(
        mr_job.partitioner(),
        'org.apache.hadoop.mapred.lib.HashPartitioner')
    self.assertEqual(
        mr_job.jobconf(),
        combine_dicts(_SORT_VALUES_JOBCONF, MRSortValuesAndMore.JOBCONF))
def _jobconf_for_step(self, step_num):
    """Get the jobconf dictionary, optionally including step-specific
    jobconf info.

    Also translate jobconfs to the current Hadoop version, if necessary.
    """
    step = self._get_step(step_num)
    jobconf = combine_dicts(self._opts["jobconf"], step.get("jobconf"))

    return add_translated_jobconf_for_hadoop_version(
        jobconf, self.get_hadoop_version())
def hadoop_job_runner_kwargs(self):
    """Keyword arguments used to create runners when
    :py:meth:`make_runner` is called, when we run a job on Hadoop
    (``-r hadoop``).

    :return: map from arg name to value

    Re-define this if you want finer control when running jobs on Hadoop.
    """
    return combine_dicts(
        self.job_runner_kwargs(),
        self._get_kwargs_from_opt_group(self.hadoop_opt_group))
def _hadoop_conf_args(self, step, step_num, num_steps):
    """Build a list of extra arguments to the hadoop binary.

    This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
    *hadoop_output_format*, *jobconf*, and *partitioner*.

    This doesn't handle input, output, mappers, reducers, or uploading
    files.
    """
    assert 0 <= step_num < num_steps

    args = []

    jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

    # hadoop_extra_args
    args.extend(self._opts['hadoop_extra_args'])

    # new-style jobconf
    version = self.get_hadoop_version()

    # translate the jobconf configuration names to match
    # the hadoop version
    jobconf = add_translated_jobconf_for_hadoop_version(jobconf, version)

    if uses_generic_jobconf(version):
        for key, value in sorted(jobconf.iteritems()):
            if value is not None:
                args.extend(['-D', '%s=%s' % (key, value)])
    # old-style jobconf
    else:
        for key, value in sorted(jobconf.iteritems()):
            if value is not None:
                args.extend(['-jobconf', '%s=%s' % (key, value)])

    # partitioner
    if self._partitioner:
        args.extend(['-partitioner', self._partitioner])

    # cmdenv
    for key, value in sorted(self._opts['cmdenv'].iteritems()):
        args.append('-cmdenv')
        args.append('%s=%s' % (key, value))

    # hadoop_input_format
    if step_num == 0 and self._hadoop_input_format:
        args.extend(['-inputformat', self._hadoop_input_format])

    # hadoop_output_format
    if step_num == num_steps - 1 and self._hadoop_output_format:
        args.extend(['-outputformat', self._hadoop_output_format])

    return args
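# To illustrate the two jobconf arg styles the method above can emit
# (uses_generic_jobconf() picks between them by Hadoop version; the jobconf
# dict below is just sample data):

jobconf = {'mapred.reduce.tasks': '2', 'user.name': 'dave'}

# generic options parser (Hadoop 0.20+): -D key=value
args = []
for key, value in sorted(jobconf.items()):
    args.extend(['-D', '%s=%s' % (key, value)])
print(args)
# ['-D', 'mapred.reduce.tasks=2', '-D', 'user.name=dave']

# old-style streaming switch: -jobconf key=value
args = []
for key, value in sorted(jobconf.items()):
    args.extend(['-jobconf', '%s=%s' % (key, value)])
print(args)
# ['-jobconf', 'mapred.reduce.tasks=2', '-jobconf', 'user.name=dave']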
def test_no_special_logic_for_paths(self):
    assert_equal(combine_dicts(
        {'PATH': '/bin:/usr/bin',
         'PYTHONPATH': '/usr/lib/python/site-packages',
         'PS1': '> '},
        {'PATH': '/home/dave/bin',
         'PYTHONPATH': '/home/dave/python',
         'CLASSPATH': '/home/dave/java',
         'PS1': '\w> '}),
        {'PATH': '/home/dave/bin',
         'PYTHONPATH': '/home/dave/python',
         'CLASSPATH': '/home/dave/java',
         'PS1': '\w> '})
def _jobconf_for_step(self, step_num):
    """Get the jobconf dictionary, optionally including step-specific
    jobconf info.

    Also translate jobconfs to the current Hadoop version, if necessary.
    """
    step = self._get_step(step_num)
    jobconf = combine_dicts(self._opts["jobconf"], step.get("jobconf"))

    # if user is using the wrong jobconfs, add in the correct ones
    self._update_jobconf_for_hadoop_version(
        jobconf, self.get_hadoop_version())

    return jobconf
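# The merge order above means a step's own jobconf overrides the runner-wide
# jobconf for that step only. A quick illustration with sample keys:

from mrjob.conf import combine_dicts

runner_jobconf = {'mapred.reduce.tasks': '1', 'mapreduce.job.name': 'wc'}
step_jobconf = {'mapred.reduce.tasks': '8'}  # just for this step

print(combine_dicts(runner_jobconf, step_jobconf))
# mapred.reduce.tasks -> '8', mapreduce.job.name -> 'wc'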
def test_can_override_sort_values_from_cmd_line(self):
    mr_job = MRSortValues([
        '--partitioner', 'org.pants.FancyPantsPartitioner',
        '--jobconf', 'stream.num.map.output.key.fields=lots',
    ])

    self.assertEqual(
        mr_job.partitioner(),
        'org.pants.FancyPantsPartitioner')
    self.assertEqual(
        mr_job.jobconf(),
        combine_dicts(_SORT_VALUES_JOBCONF,
                      {'stream.num.map.output.key.fields': 'lots'}))
def _default_opts(self):
    return combine_dicts(
        super(DataprocJobRunner, self)._default_opts(),
        dict(
            bootstrap_python=True,
            check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
            cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
            cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
            image_version=_DEFAULT_IMAGE_VERSION,
            instance_type=_DEFAULT_INSTANCE_TYPE,
            master_instance_type=_DEFAULT_INSTANCE_TYPE,
            num_core_instances=_DATAPROC_MIN_WORKERS,
            num_task_instances=0,
        )
    )
def emr_job_runner_kwargs(self):
    args = super(DownloadToS3, self).emr_job_runner_kwargs()

    # set up AWS credentials on EMR instances
    access_key = os.environ['AWS_ACCESS_KEY_ID']
    secret = os.environ['AWS_SECRET_ACCESS_KEY']
    args['cmdenv'] = combine_dicts(
        args['cmdenv'],
        {'AWS_ACCESS_KEY_ID': access_key,
         'AWS_SECRET_ACCESS_KEY': secret})

    # install pip, aws-cli, and boto
    args['bootstrap_cmds'] = combine_lists(
        args['bootstrap_cmds'],
        ['sysctl -w "net.ipv4.tcp_window_scaling=0"',
         'sudo apt-get install python-pip',
         'sudo pip install awscli',
         'sudo pip install boto'])

    return args
def _default_opts(self):
    return combine_dicts(
        super(HadoopInTheCloudJobRunner, self)._default_opts(),
        dict(
            cloud_part_size_mb=100,  # 100 MB
            max_mins_idle=_DEFAULT_MAX_MINS_IDLE,
            # don't use a list because it makes it hard to read option
            # values when running in verbose mode. See #1284
            ssh_bind_ports=xrange(40001, 40841),
            ssh_tunnel=False,
            ssh_tunnel_is_open=False,
            # ssh_bin isn't included here. For example, the Dataproc
            # runner launches ssh through the gcloud util
        ),
    )
def default_options(self):
    super_opts = super(RunnerOptionStore, self).default_options()

    try:
        owner = getpass.getuser()
    except:
        owner = None

    return combine_dicts(super_opts, {
        'base_tmp_dir': tempfile.gettempdir(),
        'bootstrap_mrjob': True,
        'cleanup': ['ALL'],
        'cleanup_on_failure': ['NONE'],
        'hadoop_version': '0.20',
        'owner': owner,
    })
def job_runner_kwargs(self):
    """Keyword arguments used to create runners when
    :py:meth:`make_runner` is called.

    :return: map from arg name to value

    Re-define this if you want finer control of runner initialization.

    You might find :py:meth:`mrjob.conf.combine_dicts` useful if you
    want to add or change lots of keyword arguments.
    """
    return combine_dicts(
        self._non_option_kwargs(),
        self._kwargs_from_switches(_allowed_keys('base')),
        self._job_kwargs(),
    )
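# The docstring above suggests combine_dicts() for changing lots of kwargs at
# once. A sketch of what such an override might look like; MyMRJob and the
# specific kwargs are hypothetical:

from mrjob.conf import combine_dicts
from mrjob.job import MRJob

class MyMRJob(MRJob):

    def job_runner_kwargs(self):
        # keep the defaults, swap in a couple of values
        return combine_dicts(
            super(MyMRJob, self).job_runner_kwargs(),
            dict(label='nightly-run', owner='dataeng'),
        )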
def default_options(self):
    super_opts = super(RunnerOptionStore, self).default_options()

    try:
        owner = getpass.getuser()
    except:
        owner = None

    return combine_dicts(super_opts, {
        'check_input_paths': True,
        'cleanup': ['ALL'],
        'cleanup_on_failure': ['NONE'],
        'local_tmp_dir': tempfile.gettempdir(),
        'owner': owner,
        'sh_bin': ['sh', '-ex'],
        'strict_protocols': True,
    })
def test_no_special_logic_for_paths(self):
    self.assertEqual(
        combine_dicts(
            {"PATH": "/bin:/usr/bin",
             "PYTHONPATH": "/usr/lib/python/site-packages",
             "PS1": "> "},
            {"PATH": "/home/dave/bin",
             "PYTHONPATH": "/home/dave/python",
             "CLASSPATH": "/home/dave/java",
             "PS1": "\w> "},
        ),
        {"PATH": "/home/dave/bin",
         "PYTHONPATH": "/home/dave/python",
         "CLASSPATH": "/home/dave/java",
         "PS1": "\w> "},
    )
def jobconf(self):
    """``-D`` args to pass to hadoop streaming. This should be a map
    from property name to value.

    By default, this combines :option:`jobconf` options from the command
    line with :py:attr:`JOBCONF`, with command-line arguments taking
    precedence. We also blank out ``mapred.output.key.comparator.class``
    and ``mapred.text.key.comparator.options`` to prevent interference
    from :file:`mrjob.conf`.

    :py:attr:`SORT_VALUES` *can* be overridden by :py:attr:`JOBCONF`, the
    command line, and step-specific ``jobconf`` values.

    For example, if you know your values are numbers, and want to sort
    them in reverse, you could do::

        SORT_VALUES = True

        JOBCONF = {
            'mapred.output.key.comparator.class':
                'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k1 -k2nr',
        }

    If you want to re-define this, it's strongly recommended that you do
    something like this, so as not to inadvertently disable the
    :option:`jobconf` option::

        def jobconf(self):
            orig_jobconf = super(MyMRJobClass, self).jobconf()
            custom_jobconf = ...

            return mrjob.conf.combine_dicts(orig_jobconf, custom_jobconf)
    """
    # combine job and runner jobconf
    unfiltered_jobconf = combine_dicts(self.JOBCONF, self.options.jobconf)

    # turn booleans into the Java equivalent ("false", not "False")
    return {
        k: json.dumps(v) if not isinstance(v, string_types) else v
        for k, v in unfiltered_jobconf.items()
        if v is not None
    }
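# The json.dumps() pass above is what maps Python literals onto their
# Java-friendly string forms. A quick standalone illustration (using str
# instead of six's string_types for brevity):

import json

for v in (True, False, 1024, 'already-a-string'):
    encoded = json.dumps(v) if not isinstance(v, str) else v
    print(repr(v), '->', repr(encoded))
# True -> 'true'
# False -> 'false'
# 1024 -> '1024'
# 'already-a-string' -> 'already-a-string'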
class DataprocRunnerOptionStore(RunnerOptionStore):

    ALLOWED_KEYS = RunnerOptionStore.ALLOWED_KEYS.union(set([
        'gcp_project',
        'cluster_id',
        'region',
        'zone',
        'image_version',
        'check_cluster_every',
        'instance_type',
        'master_instance_type',
        'core_instance_type',
        'task_instance_type',
        'num_core_instances',
        'num_task_instances',
        'cloud_fs_sync_secs',
        'cloud_tmp_dir',
        'bootstrap',
        'bootstrap_python',
        'max_hours_idle',
    ]))

    COMBINERS = combine_dicts(RunnerOptionStore.COMBINERS, {
        'bootstrap': combine_lists,
        'cloud_tmp_dir': combine_paths,
    })

    DEFAULT_FALLBACKS = {
        'core_instance_type': 'instance_type',
        'task_instance_type': 'instance_type',
    }

    def __init__(self, alias, opts, conf_paths):
        super(DataprocRunnerOptionStore, self).__init__(
            alias, opts, conf_paths)

        # Dataproc requires a master and >= 2 core instances.
        # num_core_instances refers ONLY to number of CORE instances and
        # does NOT include the required 1 instance for master.
        # In other words, minimum cluster size is 3 machines: 1 master and
        # 2 "num_core_instances" workers.
        if self['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' %
                _DATAPROC_MIN_WORKERS)

        for varname, fallback_varname in self.DEFAULT_FALLBACKS.items():
            self[varname] = self[varname] or self[fallback_varname]

        if self['core_instance_type'] != self['task_instance_type']:
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be '
                'identical')

    def default_options(self):
        super_opts = super(DataprocRunnerOptionStore, self).default_options()
        return combine_dicts(super_opts, {
            'bootstrap_python': True,
            'image_version': _DEFAULT_IMAGE_VERSION,
            'check_cluster_every': _DEFAULT_CHECK_CLUSTER_EVERY,
            'instance_type': _DEFAULT_INSTANCE_TYPE,
            'master_instance_type': _DEFAULT_INSTANCE_TYPE,
            'num_core_instances': _DATAPROC_MIN_WORKERS,
            'num_task_instances': 0,
            'cloud_fs_sync_secs': _DEFAULT_CLOUD_FS_SYNC_SECS,
            'max_hours_idle': _DEFAULT_MAX_HOURS_IDLE,
            'sh_bin': ['/bin/sh', '-ex'],
            'cleanup': ['CLUSTER', 'JOB', 'LOCAL_TMP'],
        })
def test_later_values_take_precedence(self):
    self.assertEqual(
        combine_dicts({'TMPDIR': '/tmp', 'HOME': '/home/dave'},
                      {'TMPDIR': '/var/tmp'}),
        {'TMPDIR': '/var/tmp', 'HOME': '/home/dave'})
class SimRunnerOptionStore(RunnerOptionStore):

    COMBINERS = combine_dicts(RunnerOptionStore.COMBINERS, {
        'cmdenv': combine_local_envs,
    })
def test_None_value(self):
    self.assertEqual(
        combine_dicts({'USER': '******', 'TERM': 'xterm'},
                      {'USER': None}),
        {'TERM': 'xterm', 'USER': None})
def _runner_kwargs(self):
    return combine_dicts(
        self._non_option_kwargs(),
        self._kwargs_from_switches(self._runner_opt_names()),
        self._job_kwargs(),
    )
def test_cleared_value(self):
    self.assertEqual(
        combine_dicts({'USER': '******', 'TERM': 'xterm'},
                      {'USER': ClearedValue('caleb')}),
        {'TERM': 'xterm', 'USER': '******'})
def test_skip_None(self):
    self.assertEqual(
        combine_dicts(None, {'USER': '******'}, None,
                      {'TERM': 'xterm'}, None),
        {'USER': '******', 'TERM': 'xterm'})
class RunnerOptionStore(OptionStore):

    # Test cases for this class live in tests.test_option_store rather
    # than tests.test_runner.

    ALLOWED_KEYS = OptionStore.ALLOWED_KEYS.union(set([
        'base_tmp_dir',
        'bootstrap_mrjob',
        'cleanup',
        'cleanup_on_failure',
        'cmdenv',
        'hadoop_extra_args',
        'hadoop_streaming_jar',
        'hadoop_version',
        'interpreter',
        'jobconf',
        'label',
        'owner',
        'python_archives',
        'python_bin',
        'setup',
        'setup_cmds',
        'setup_scripts',
        'sh_bin',
        'steps_interpreter',
        'steps_python_bin',
        'upload_archives',
        'upload_files',
    ]))

    COMBINERS = combine_dicts(OptionStore.COMBINERS, {
        'base_tmp_dir': combine_paths,
        'cmdenv': combine_envs,
        'hadoop_extra_args': combine_lists,
        'interpreter': combine_cmds,
        'jobconf': combine_dicts,
        'python_archives': combine_path_lists,
        'python_bin': combine_cmds,
        'setup': combine_lists,
        'setup_cmds': combine_lists,
        'setup_scripts': combine_path_lists,
        'sh_bin': combine_cmds,
        'steps_interpreter': combine_cmds,
        'steps_python_bin': combine_cmds,
        'upload_archives': combine_path_lists,
        'upload_files': combine_path_lists,
    })

    def __init__(self, alias, opts, conf_paths):
        """
        :param alias: Runner alias (e.g. ``'local'``)
        :param opts: Options from the command line
        :param conf_paths: Either a file path or an iterable of paths to
                           config files
        """
        super(RunnerOptionStore, self).__init__()

        # sanitize incoming options and issue warnings for bad keys
        opts = self.validated_options(
            opts, 'Got unexpected keyword arguments: %s')

        unsanitized_opt_dicts = load_opts_from_mrjob_confs(
            alias, conf_paths=conf_paths)

        for path, mrjob_conf_opts in unsanitized_opt_dicts:
            self.cascading_dicts.append(self.validated_options(
                mrjob_conf_opts,
                'Got unexpected opts from %s: %%s' % path))

        self.cascading_dicts.append(opts)

        if (len(self.cascading_dicts) > 2 and
                all(len(d) == 0 for d in self.cascading_dicts[2:-1]) and
                (len(conf_paths or []) > 0 or len(opts) == 0)):
            log.warning('No configs specified for %s runner' % alias)

        self.populate_values_from_cascading_dicts()

        self._validate_cleanup()

        self._fix_interp_options()

        log.debug('Active configuration:')
        log.debug(pprint.pformat(self))

    def default_options(self):
        super_opts = super(RunnerOptionStore, self).default_options()

        try:
            owner = getpass.getuser()
        except:
            owner = None

        return combine_dicts(super_opts, {
            'base_tmp_dir': tempfile.gettempdir(),
            'bootstrap_mrjob': True,
            'cleanup': ['ALL'],
            'cleanup_on_failure': ['NONE'],
            'hadoop_version': '0.20',
            'owner': owner,
            'sh_bin': ['sh'],
        })

    def _validate_cleanup(self):
        # old API accepts strings for cleanup
        # new API wants lists
        for opt_key in ('cleanup', 'cleanup_on_failure'):
            if isinstance(self[opt_key], basestring):
                self[opt_key] = [self[opt_key]]

        def validate_cleanup(error_str, opt_list):
            for choice in opt_list:
                if choice not in CLEANUP_CHOICES:
                    raise ValueError(error_str % choice)
            if 'NONE' in opt_list and len(set(opt_list)) > 1:
                raise ValueError(
                    'Cannot clean up both nothing and something!')

        cleanup_error = ('cleanup must be one of %s, not %%s' %
                         ', '.join(CLEANUP_CHOICES))
        validate_cleanup(cleanup_error, self['cleanup'])

        cleanup_failure_error = (
            'cleanup_on_failure must be one of %s, not %%s' %
            ', '.join(CLEANUP_CHOICES))
        validate_cleanup(cleanup_failure_error,
                         self['cleanup_on_failure'])

    def _fix_interp_options(self):
        if not self['steps_python_bin']:
            # note: the fallback to 'python' belongs *inside* the list;
            # [sys.executable] is truthy even when sys.executable is empty
            self['steps_python_bin'] = (
                self['python_bin'] or [sys.executable or 'python'])

        if not self['python_bin']:
            self['python_bin'] = ['python']

        if not self['steps_interpreter']:
            if self['interpreter']:
                self['steps_interpreter'] = self['interpreter']
            else:
                self['steps_interpreter'] = self['steps_python_bin']

        if not self['interpreter']:
            self['interpreter'] = self['python_bin']
@classmethod
def _default_opts(cls):
    """A dictionary giving the default value of options."""
    return combine_dicts(super(LocalMRJobRunner, cls)._default_opts(), {
        # prefer whatever interpreter we're currently using
        'python_bin': [sys.executable or 'python'],
    })
def default_options(self):
    super_opts = super(HadoopRunnerOptionStore, self).default_options()
    return combine_dicts(super_opts, {
        'hadoop_tmp_dir': 'tmp/mrjob',
    })
def jobconf(self):
    """``-D`` args to pass to hadoop streaming. This should be a map
    from property name to value.

    By default, this combines :option:`jobconf` options from the command
    line with :py:attr:`JOBCONF`, with command-line arguments taking
    precedence. We also blank out ``mapred.output.key.comparator.class``
    and ``mapred.text.key.comparator.options`` to prevent interference
    from :file:`mrjob.conf`.

    :py:attr:`SORT_VALUES` *can* be overridden by :py:attr:`JOBCONF`, the
    command line, and step-specific ``jobconf`` values.

    For example, if you know your values are numbers, and want to sort
    them in reverse, you could do::

        SORT_VALUES = True

        JOBCONF = {
            'mapred.output.key.comparator.class':
                'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k1 -k2nr',
        }

    If you want to re-define this, it's strongly recommended that you do
    something like this, so as not to inadvertently disable the
    :option:`jobconf` option::

        def jobconf(self):
            orig_jobconf = super(MyMRJobClass, self).jobconf()
            custom_jobconf = ...

            return mrjob.conf.combine_dicts(orig_jobconf, custom_jobconf)
    """
    # deal with various forms of bad behavior by users
    unfiltered_jobconf = combine_dicts(self.JOBCONF, self.options.jobconf)
    filtered_jobconf = {}

    def format_hadoop_version(v_float):
        if v_float >= 1.0:
            # e.g. 1.0
            return '%.1f' % v_float
        else:
            # e.g. 0.20
            return '%.2f' % v_float

    for key in unfiltered_jobconf:
        unfiltered_val = unfiltered_jobconf[key]
        filtered_val = unfiltered_val

        # boolean values need to be lowercased
        if isinstance(unfiltered_val, bool):
            if unfiltered_val:
                filtered_val = 'true'
            else:
                filtered_val = 'false'

        # TODO: why would a jobconf variable be named 'hadoop_version'?
        # hadoop_version should be a string
        elif (key == 'hadoop_version' and
                isinstance(unfiltered_val, float)):
            log.warning('hadoop_version should be a string, not %s' %
                        unfiltered_val)
            filtered_val = format_hadoop_version(unfiltered_val)

        filtered_jobconf[key] = filtered_val

    return filtered_jobconf
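# A quick standalone check of the nested version-formatting helper (copied
# out of the method above so it can run on its own):

def format_hadoop_version(v_float):
    if v_float >= 1.0:
        return '%.1f' % v_float   # e.g. 1.0
    else:
        return '%.2f' % v_float   # e.g. 0.20

print(format_hadoop_version(1.0))   # '1.0'
print(format_hadoop_version(2.0))   # '2.0'
print(format_hadoop_version(0.2))   # '0.20'
print(format_hadoop_version(0.18))  # '0.18'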
def test_deleted_value(self):
    self.assertEqual(
        combine_dicts({'USER': '******', 'TERM': 'xterm'},
                      {'USER': ClearedValue(None)}),
        {'TERM': 'xterm'})
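# Taken together, the tests above pin down combine_dicts()'s semantics:
# later values win, None dicts are skipped, plain None values are kept, and
# ClearedValue wraps deletions/overrides. A minimal re-implementation sketch
# of those semantics -- not mrjob's actual code; ClearedValue is stood in by
# a tiny wrapper class:

class ClearedValue(object):
    # stand-in for mrjob's ClearedValue wrapper
    def __init__(self, value):
        self.value = value

def combine_dicts_sketch(*dicts):
    result = {}
    for d in dicts:
        if d is None:  # None in place of a dict is skipped entirely
            continue
        for key, value in d.items():
            if isinstance(value, ClearedValue):
                # ClearedValue(None) deletes the key; ClearedValue(v)
                # blanks out any earlier value and sets v
                result.pop(key, None)
                if value.value is not None:
                    result[key] = value.value
            else:
                # later values win; a plain None value is kept
                result[key] = value
    return result

print(combine_dicts_sketch({'TMPDIR': '/tmp'}, {'TMPDIR': '/var/tmp'}))
# {'TMPDIR': '/var/tmp'}
print(combine_dicts_sketch({'USER': 'dave'}, {'USER': ClearedValue(None)}))
# {}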
@classmethod
def _opts_combiners(cls):
    # on windows, PYTHONPATH should use ;, not :
    return combine_dicts(
        super(LocalMRJobRunner, cls)._opts_combiners(),
        {'cmdenv': combine_local_envs})
def _opt_combiners(self):
    """Combine *cmdenv* with :py:func:`~mrjob.conf.combine_local_envs`"""
    return combine_dicts(
        super(SimMRJobRunner, self)._opt_combiners(),
        dict(cmdenv=combine_local_envs),
    )
class RunnerOptionStore(OptionStore):

    # Test cases for this class live in tests.test_option_store rather
    # than tests.test_runner.

    ALLOWED_KEYS = OptionStore.ALLOWED_KEYS.union(set([
        'bootstrap_mrjob',
        'check_input_paths',
        'cleanup',
        'cleanup_on_failure',
        'cmdenv',
        'hadoop_version',
        'interpreter',
        'jobconf',
        'label',
        'local_tmp_dir',
        'owner',
        'python_archives',
        'python_bin',
        'setup',
        'setup_cmds',
        'setup_scripts',
        'sh_bin',
        'steps_interpreter',
        'steps_python_bin',
        'strict_protocols',
        'upload_archives',
        'upload_files',
    ]))

    COMBINERS = combine_dicts(OptionStore.COMBINERS, {
        'cmdenv': combine_envs,
        'interpreter': combine_cmds,
        'jobconf': combine_dicts,
        'local_tmp_dir': combine_paths,
        'python_archives': combine_path_lists,
        'python_bin': combine_cmds,
        'setup': combine_lists,
        'setup_cmds': combine_lists,
        'setup_scripts': combine_path_lists,
        'sh_bin': combine_cmds,
        'steps_interpreter': combine_cmds,
        'steps_python_bin': combine_cmds,
        'upload_archives': combine_path_lists,
        'upload_files': combine_path_lists,
    })

    DEPRECATED_ALIASES = {
        'base_tmp_dir': 'local_tmp_dir',
    }

    def __init__(self, alias, opts, conf_paths):
        """
        :param alias: Runner alias (e.g. ``'local'``)
        :param opts: Keyword args to runner's constructor (usually from
                     the command line).
        :param conf_paths: An iterable of paths to config files
        """
        super(RunnerOptionStore, self).__init__()

        # sanitize incoming options and issue warnings for bad keys
        opts = self.validated_options(opts)

        unsanitized_opt_dicts = load_opts_from_mrjob_confs(
            alias, conf_paths=conf_paths)

        for path, mrjob_conf_opts in unsanitized_opt_dicts:
            self.cascading_dicts.append(self.validated_options(
                mrjob_conf_opts, from_where=(' from %s' % path)))

        self.cascading_dicts.append(opts)

        if (len(self.cascading_dicts) > 2 and
                all(len(d) == 0 for d in self.cascading_dicts[2:-1]) and
                len(conf_paths or []) > 0):
            log.warning('No configs specified for %s runner' % alias)

        self.populate_values_from_cascading_dicts()

        log.debug('Active configuration:')
        log.debug(pprint.pformat(self))

    def default_options(self):
        super_opts = super(RunnerOptionStore, self).default_options()

        try:
            owner = getpass.getuser()
        except:
            owner = None

        return combine_dicts(super_opts, {
            'check_input_paths': True,
            'cleanup': ['ALL'],
            'cleanup_on_failure': ['NONE'],
            'local_tmp_dir': tempfile.gettempdir(),
            'owner': owner,
            'sh_bin': ['sh', '-ex'],
            'strict_protocols': True,
        })

    def validated_options(self, opts, from_where=''):
        opts = super(RunnerOptionStore, self).validated_options(
            opts, from_where)

        self._fix_cleanup_opt('cleanup', opts, from_where)
        self._fix_cleanup_opt('cleanup_on_failure', opts, from_where)

        return opts

    def _fix_cleanup_opt(self, opt_key, opts, from_where=''):
        if opts.get(opt_key) is None:
            return

        opt_list = opts[opt_key]

        # runner expects a list of strings, not a string
        if isinstance(opt_list, string_types):
            opt_list = [opt_list]

        if 'NONE' in opt_list and len(set(opt_list)) > 1:
            raise ValueError('Cannot clean up both nothing and something!')

        def handle_cleanup_opt(opt):
            if opt in CLEANUP_CHOICES:
                return opt

            if opt in _CLEANUP_DEPRECATED_ALIASES:
                aliased_opt = _CLEANUP_DEPRECATED_ALIASES[opt]
                # TODO: don't do this when option value is None
                log.warning(
                    'Deprecated %s option %s%s has been renamed to %s' %
                    (opt_key, opt, from_where, aliased_opt))
                return aliased_opt

            raise ValueError('%s must be one of %s, not %s' % (
                opt_key, ', '.join(CLEANUP_CHOICES), opt))

        opt_list = [handle_cleanup_opt(opt) for opt in opt_list]

        opts[opt_key] = opt_list
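# Conceptually, the cascading_dicts machinery above stacks option layers and
# lets later layers win. A rough sketch with made-up layer contents; in
# reality each option is merged by its own combiner from COMBINERS (e.g.
# combine_envs for cmdenv), but the precedence idea is the same:

from mrjob.conf import combine_dicts

# hypothetical layers, lowest precedence first
defaults = {'cleanup': ['ALL'], 'sh_bin': ['sh', '-ex']}
mrjob_conf = {'cleanup': ['JOB'], 'owner': 'dave'}
command_line = {'owner': 'caleb'}

print(combine_dicts(defaults, mrjob_conf, command_line))
# cleanup -> ['JOB'], sh_bin -> ['sh', '-ex'], owner -> 'caleb'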
def test_empty(self):
    assert_equal(combine_dicts(), {})
def test_empty(self):
    self.assertEqual(combine_dicts(), {})
def default_options(self):
    # don't bootstrap mrjob by default when running locally
    super_opts = super(SimRunnerOptionStore, self).default_options()
    return combine_dicts(super_opts, {
        'bootstrap_mrjob': False,
    })
def default_options(self):
    super_opts = super(LocalRunnerOptionStore, self).default_options()
    return combine_dicts(super_opts, {
        # prefer whatever interpreter we're currently using
        'python_bin': [sys.executable or 'python'],
    })