Example no. 1
0
 def _job_runner_kwargs_for_runner(self, runner_alias):
     """Build the runner kwargs for *runner_alias*; backs the
     *_job_runner_kwargs() methods."""
     # Merging the switch-derived kwargs here means an overridden
     # job_runner_kwargs() can no longer silently ignore switches.
     allowed = _allowed_keys(runner_alias)
     switch_kwargs = self._kwargs_from_switches(allowed)
     return combine_dicts(switch_kwargs, self.job_runner_kwargs())
Example no. 2
0
 def _job_runner_kwargs_for_runner(self, runner_alias):
     """Compute constructor kwargs for the runner known as
     *runner_alias* (helper behind the *_job_runner_kwargs() methods).
     """
     # Switch-derived kwargs are always part of the merge, so overriding
     # job_runner_kwargs() can't silently drop command-line switches.
     switch_kwargs = self._kwargs_from_switches(
         _allowed_keys(runner_alias))
     override_kwargs = self.job_runner_kwargs()
     return combine_dicts(switch_kwargs, override_kwargs)
Example no. 3
0
class HadoopRunnerOptionStore(RunnerOptionStore):
    """Option store for the ``hadoop`` runner."""

    ALLOWED_KEYS = _allowed_keys('hadoop')
    COMBINERS = _combiners('hadoop')
    DEPRECATED_ALIASES = _deprecated_aliases('hadoop')

    def default_options(self):
        """Layer hadoop-specific defaults over the base defaults."""
        hadoop_defaults = {'hadoop_tmp_dir': 'tmp/mrjob'}
        base_opts = super(HadoopRunnerOptionStore, self).default_options()
        return combine_dicts(base_opts, hadoop_defaults)
Example no. 4
0
class DataprocRunnerOptionStore(RunnerOptionStore):
    """Option store for the ``dataproc`` runner."""

    ALLOWED_KEYS = _allowed_keys('dataproc')
    COMBINERS = _combiners('dataproc')
    DEPRECATED_ALIASES = _deprecated_aliases('dataproc')

    # options whose value falls back to another option when unset
    DEFAULT_FALLBACKS = {
        'core_instance_type': 'instance_type',
        'task_instance_type': 'instance_type'
    }

    def __init__(self, alias, opts, conf_paths):
        """Validate Dataproc-specific constraints after the base
        option cascade is built.

        :raises DataprocException: on too few core instances, or on
                                   mismatched core/task instance types
        """
        super(DataprocRunnerOptionStore, self).__init__(
            alias, opts, conf_paths)

        # Dataproc needs one master plus at least _DATAPROC_MIN_WORKERS
        # CORE instances. num_core_instances counts workers only (no
        # master), so the minimum cluster is 1 master + 2 workers.
        if self['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

        # fill unset instance types from their fallback option
        for opt_name, fallback_name in self.DEFAULT_FALLBACKS.items():
            self[opt_name] = self[opt_name] or self[fallback_name]

        if self['core_instance_type'] != self['task_instance_type']:
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

    def default_options(self):
        """Layer Dataproc-specific defaults over the base defaults."""
        dataproc_defaults = {
            'bootstrap_python': True,
            'image_version': _DEFAULT_IMAGE_VERSION,
            'check_cluster_every': _DEFAULT_CHECK_CLUSTER_EVERY,

            'instance_type': _DEFAULT_INSTANCE_TYPE,
            'master_instance_type': _DEFAULT_INSTANCE_TYPE,

            'num_core_instances': _DATAPROC_MIN_WORKERS,
            'num_task_instances': 0,

            'cloud_fs_sync_secs': _DEFAULT_CLOUD_FS_SYNC_SECS,

            'max_hours_idle': _DEFAULT_MAX_HOURS_IDLE,
            'sh_bin': ['/bin/sh', '-ex'],

            'cleanup': ['CLUSTER', 'JOB', 'LOCAL_TMP']
        }
        base_opts = super(DataprocRunnerOptionStore, self).default_options()
        return combine_dicts(base_opts, dataproc_defaults)
Example no. 5
0
    def job_runner_kwargs(self):
        """Return the keyword args used to construct a runner when
        :py:meth:`make_runner` is called.

        :return: dict mapping argument name to value

        Override this for finer control of runner initialization;
        :py:meth:`mrjob.conf.combine_dicts` is handy when adding or
        changing many keyword arguments at once.
        """
        # later dicts win on conflicting keys
        non_option = self._non_option_kwargs()
        from_switches = self._kwargs_from_switches(_allowed_keys('base'))
        job_specific = self._job_kwargs()
        return combine_dicts(non_option, from_switches, job_specific)
Example no. 6
0
    def job_runner_kwargs(self):
        """Keyword arguments that :py:meth:`make_runner` passes to the
        runner's constructor.

        :return: map from arg name to value

        Re-define this for finer control of runner initialization; see
        :py:meth:`mrjob.conf.combine_dicts` for merging in lots of
        keyword arguments at once.
        """
        # assembled in priority order; later entries override earlier ones
        kwarg_dicts = [
            self._non_option_kwargs(),
            self._kwargs_from_switches(_allowed_keys('base')),
            self._job_kwargs(),
        ]
        return combine_dicts(*kwarg_dicts)
Example no. 7
0
class SimRunnerOptionStore(RunnerOptionStore):
    """Option store for the simulated runners.

    Both simulated runners share the 'local' option tables.
    """
    # these are the same for 'local' and 'inline' runners
    ALLOWED_KEYS = _allowed_keys('local')
    COMBINERS = _combiners('local')
    DEPRECATED_ALIASES = _deprecated_aliases('local')
Example no. 8
0
class RunnerOptionStore(OptionStore):
    """Base option store shared by all runners.

    Builds a cascade of option dicts: defaults, then options loaded from
    each mrjob.conf file, then options passed on the command line.
    """

    # 'base' is arbitrary; if an option supports all runners, it won't
    # have "runners" set in _RUNNER_OPTS at all
    ALLOWED_KEYS = _allowed_keys('base')
    COMBINERS = _combiners('base')
    DEPRECATED_ALIASES = _deprecated_aliases('base')

    def __init__(self, alias, opts, conf_paths):
        """
        :param alias: Runner alias (e.g. ``'local'``)
        :param opts: Keyword args to runner's constructor (usually from the
                     command line).
        :param conf_paths: An iterable of paths to config files
        """
        super(RunnerOptionStore, self).__init__()

        # sanitize incoming options and issue warnings for bad keys
        opts = self.validated_options(opts)

        unsanitized_opt_dicts = load_opts_from_mrjob_confs(
            alias, conf_paths=conf_paths)

        for path, mrjob_conf_opts in unsanitized_opt_dicts:
            self.cascading_dicts.append(
                self.validated_options(mrjob_conf_opts,
                                       from_where=(' from %s' % path)))

        self.cascading_dicts.append(opts)

        # warn when conf files were given but contributed no options
        # (dicts before index 2 are defaults; the last dict is the
        # command-line opts appended above)
        if (len(self.cascading_dicts) > 2
                and all(len(d) == 0 for d in self.cascading_dicts[2:-1])
                and (len(conf_paths or []) > 0)):
            log.warning('No configs specified for %s runner' % alias)

        self.populate_values_from_cascading_dicts()

        log.debug('Active configuration:')
        log.debug(
            pprint.pformat(
                dict((opt_key, self._obfuscate(opt_key, opt_value))
                     for opt_key, opt_value in self.items())))

    def default_options(self):
        """Return default option values common to all runners."""
        super_opts = super(RunnerOptionStore, self).default_options()

        try:
            owner = getpass.getuser()
        except Exception:
            # getuser() can raise (e.g. no login name available in the
            # environment); a bare except here would also swallow
            # KeyboardInterrupt/SystemExit, so catch Exception only
            owner = None

        return combine_dicts(
            super_opts, {
                'check_input_paths': True,
                'cleanup': ['ALL'],
                'cleanup_on_failure': ['NONE'],
                'local_tmp_dir': tempfile.gettempdir(),
                'owner': owner,
                'sh_bin': ['sh', '-ex'],
                'strict_protocols': True,
            })

    def validated_options(self, opts, from_where=''):
        """Validate *opts*, additionally normalizing the cleanup options."""
        opts = super(RunnerOptionStore,
                     self).validated_options(opts, from_where)

        self._fix_cleanup_opt('cleanup', opts, from_where)
        self._fix_cleanup_opt('cleanup_on_failure', opts, from_where)

        return opts

    def _fix_cleanup_opt(self, opt_key, opts, from_where=''):
        """Normalize the cleanup option *opt_key* in *opts*, in place:
        wrap a bare string in a list, translate deprecated aliases (with
        a warning), and reject invalid values.

        :raises ValueError: if a value isn't a valid cleanup choice, or
                            if 'NONE' is combined with other choices
        """
        if opts.get(opt_key) is None:
            return

        opt_list = opts[opt_key]

        # runner expects a list of strings, not a bare string
        if isinstance(opt_list, string_types):
            opt_list = [opt_list]

        if 'NONE' in opt_list and len(set(opt_list)) > 1:
            raise ValueError('Cannot clean up both nothing and something!')

        def handle_cleanup_opt(opt):
            if opt in CLEANUP_CHOICES:
                return opt

            if opt in _CLEANUP_DEPRECATED_ALIASES:
                aliased_opt = _CLEANUP_DEPRECATED_ALIASES[opt]
                # TODO: don't do this when option value is None
                log.warning(
                    'Deprecated %s option %s%s has been renamed to %s' %
                    (opt_key, opt, from_where, aliased_opt))
                return aliased_opt

            raise ValueError('%s must be one of %s, not %s' %
                             (opt_key, ', '.join(CLEANUP_CHOICES), opt))

        opt_list = [handle_cleanup_opt(opt) for opt in opt_list]

        opts[opt_key] = opt_list

    def _obfuscate(self, opt_key, opt_value):
        """Return value of opt to show in debug printout. Used to obfuscate
        credentials, etc. (base implementation is a pass-through)."""
        return opt_value