def _fix_custom_options(options, option_parser):
    """Convert the raw string values on *options* in place.

    Every option is only looked at when *options* has the corresponding
    attribute, in this order:

    * ``cmdenv`` goes through ``parse_key_value_list()``
    * ``cleanup`` and ``cleanup_on_failure`` (when set to something true)
      are split on commas and checked against ``CLEANUP_CHOICES``
    * ``emr_api_params`` goes through ``parse_key_value_list()``; each
      name in ``no_emr_api_params`` is then mapped to ``None`` in it
    * ``emr_tags`` and ``jobconf`` go through ``parse_key_value_list()``
    * ``ssh_bind_ports`` (when set to something true) goes through
      ``parse_port_range_list()``

    Invalid values are reported by calling ``option_parser.error()``.
    """
    def convert_key_values(attr, not_key_value):
        # leave *options* alone if the parser never defined this option
        if hasattr(options, attr):
            setattr(options, attr,
                    parse_key_value_list(getattr(options, attr),
                                         not_key_value,
                                         option_parser.error))

    def parse_commas(cleanup_str):
        unknown_choice = ('cleanup option %s is not one of ' +
                          ', '.join(CLEANUP_CHOICES))
        accepted = []

        for item in cleanup_str.split(','):
            if item not in CLEANUP_CHOICES:
                option_parser.error(unknown_choice % item)
            else:
                accepted.append(item)

        # NONE only makes sense on its own
        if len(set(accepted)) > 1 and 'NONE' in accepted:
            option_parser.error(
                'Cannot clean up both nothing and something!')

        return accepted

    convert_key_values(
        'cmdenv', '--cmdenv argument %r is not of the form KEY=VALUE')

    for cleanup_attr in ('cleanup', 'cleanup_on_failure'):
        raw_cleanup = getattr(options, cleanup_attr, None)
        if raw_cleanup:
            setattr(options, cleanup_attr, parse_commas(raw_cleanup))

    convert_key_values(
        'emr_api_params',
        '--emr-api-params argument %r is not of the form KEY=VALUE')

    # --no-emr-api-params only exists to blank out entries of
    # emr_api_params
    if hasattr(options, 'no_emr_api_params'):
        for name in options.no_emr_api_params:
            options.emr_api_params[name] = None

    convert_key_values(
        'emr_tags', '--emr-tag argument %r is not of the form KEY=VALUE')

    convert_key_values(
        'jobconf', '--jobconf argument %r is not of the form KEY=VALUE')

    port_spec = getattr(options, 'ssh_bind_ports', None)
    if port_spec:
        try:
            ports = parse_port_range_list(port_spec)
        except ValueError as err:
            option_parser.error('invalid port range list %r: \n%s' %
                                (port_spec, err.args[0]))
        options.ssh_bind_ports = ports
def load_options(self, args):
    """Parse *args* with ``self.option_parser`` and store the resulting
    options on ``self.options``.

    Called from :py:meth:`__init__()` after :py:meth:`configure_options`.

    :type args: list of str
    :param args: a list of command line arguments, handed unchanged to
                 ``self.option_parser.parse_args()``.

    ``help_main`` is delegated to ``self._help_main()``; each of the other
    help flags prints the matching option groups and calls
    ``sys.exit(0)``. Otherwise the leftover positional arguments go to
    ``self._process_args()`` and the options that need custom parsing
    (port ranges, ``KEY=VALUE`` lists, comma-separated cleanup choices)
    are converted in place.

    Re-define if you want to post-process command-line arguments::

        def load_options(self, args):
            super(MRYourJob, self).load_options(args)

            self.stop_words = self.options.stop_words.split(',')
            ...
    """
    parser = self.option_parser
    self.options, positional = parser.parse_args(args)

    if self.options.help_main:
        self._help_main()

    if self.options.help_emr:
        print_help_for_groups(self.hadoop_emr_opt_group,
                              self.emr_opt_group)
        sys.exit(0)

    if self.options.help_hadoop:
        print_help_for_groups(self.hadoop_emr_opt_group,
                              self.hadoop_opts_opt_group)
        sys.exit(0)

    if self.options.help_runner:
        print_help_for_groups(self.runner_opt_group)
        sys.exit(0)

    self._process_args(positional)

    # custom options are parsed here, rather than through a custom Option
    # subclass, to avoid confusing users
    opts = self.options

    def parse_commas(cleanup_str):
        unknown_choice = ('cleanup option %s is not one of ' +
                          ', '.join(CLEANUP_CHOICES))
        accepted = []

        for item in cleanup_str.split(','):
            if item not in CLEANUP_CHOICES:
                parser.error(unknown_choice % item)
            else:
                accepted.append(item)

        # NONE only makes sense on its own
        if len(set(accepted)) > 1 and 'NONE' in accepted:
            parser.error('Cannot clean up both nothing and something!')

        return accepted

    port_spec = opts.ssh_bind_ports
    if port_spec:
        try:
            ports = parse_port_range_list(port_spec)
        except ValueError as err:
            parser.error('invalid port range list "%s": \n%s' %
                         (port_spec, err.args[0]))
        opts.ssh_bind_ports = ports

    key_value_options = (
        ('cmdenv',
         'cmdenv argument "%s" is not of the form KEY=VALUE'),
        ('jobconf',
         'jobconf argument "%s" is not of the form KEY=VALUE'),
        ('emr_api_params',
         'emr-api-params argument "%s" is not of the form KEY=VALUE'),
    )
    for attr, not_key_value in key_value_options:
        setattr(opts, attr,
                parse_key_value_list(getattr(opts, attr),
                                     not_key_value,
                                     parser.error))

    # no_emr_api_params blanks out entries of emr_api_params
    for name in opts.no_emr_api_params:
        opts.emr_api_params[name] = None

    for cleanup_attr in ('cleanup', 'cleanup_on_failure'):
        raw_cleanup = getattr(opts, cleanup_attr)
        if raw_cleanup is not None:
            setattr(opts, cleanup_attr, parse_commas(raw_cleanup))
class MRJobLauncher(object):
    """Handle running a MapReduce job on an executable from the command line.
    This class will eventually support running arbitrary executables; for now
    it only supports :py:class:`~mrjob.job.MRJob` subclasses. Up to v0.5 it is
    effectively part of the :py:class:`~mrjob.job.MRJob` class itself and
    should not be used externally in any way.
    """
    #: :py:class:`optparse.Option` subclass to use with the
    #: :py:class:`optparse.OptionParser` instance.
    OPTION_CLASS = Option

    #: passed to ``add_runner_opts()`` in :py:meth:`configure_options`
    _DEFAULT_RUNNER = 'local'

    def __init__(self, script_path=None, args=None, from_cl=False):
        """Build the option parser and parse the command line.

        :param script_path: Path to script unless it's the first item of *args*
        :param args: Command line arguments
        :param from_cl: If not using sys.argv but still coming from the
                        command line (as opposed to a script, e.g. from
                        mrjob.cmd), don't override the option parser error
                        function (exit instead of throwing ValueError).
        """
        if script_path is not None:
            script_path = os.path.abspath(script_path)
        self._script_path = script_path

        # make sure we respect the $TZ (time zone) environment variable
        if hasattr(time, 'tzset'):
            time.tzset()

        # filled in by add_passthrough_option() / add_file_option()
        self._passthrough_options = []
        self._file_options = []

        self.option_parser = OptionParser(usage=self._usage(),
                                          option_class=self.OPTION_CLASS,
                                          add_help_option=False)
        self.configure_options()

        # don't pass None to parse_args unless we're actually running
        # the MRJob script
        if args is _READ_ARGS_FROM_SYS_ARGV:
            self._cl_args = sys.argv[1:]
        else:
            # don't pass sys.argv to self.option_parser, and have it
            # raise an exception on error rather than printing to stderr
            # and exiting.
            self._cl_args = args or []

            def error(msg):
                raise ValueError(msg)

            if not from_cl:
                self.option_parser.error = error

        self.load_options(self._cl_args)

        # Make it possible to redirect stdin, stdout, and stderr, for
        # testing (see sandbox(), which is not defined on this class).
        self.stdin = sys.stdin
        self.stdout = sys.stdout
        self.stderr = sys.stderr

    @classmethod
    def _usage(cls):
        """Command line usage string for this class"""
        return ("usage: mrjob run [script path|executable path|--help]"
                " [options] [input files]")

    @classmethod
    def run(cls, args=_READ_ARGS_FROM_SYS_ARGV):
        """Entry point for running job from the command-line.

        This is also the entry point when a mapper or reducer is run
        by Hadoop Streaming.

        Does one of:

        * Print step information (:option:`--steps`). See :py:meth:`show_steps`
        * Run a mapper (:option:`--mapper`). See :py:meth:`run_mapper`
        * Run a combiner (:option:`--combiner`). See :py:meth:`run_combiner`
        * Run a reducer (:option:`--reducer`). See :py:meth:`run_reducer`
        * Run the entire job. See :py:meth:`run_job`

        As implemented in this class, only the last case applies: an
        instance is built from *args* and :py:meth:`run_job` is called.
        """
        # load options from the command line
        launcher = cls(args=args)
        launcher.run_job()

    def execute(self):
        """Run the job; equivalent to calling :py:meth:`run_job`."""
        # Launcher only runs jobs, doesn't do any Hadoop Streaming stuff
        self.run_job()

    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        :raises ValueError: if ``self.options.runner`` is ``'inline'``
        """
        # have to import here so that we can still run the MRJob
        # without importing boto
        from mrjob.emr import EMRJobRunner
        from mrjob.hadoop import HadoopJobRunner
        from mrjob.local import LocalMRJobRunner

        # NOTE(review): the *_job_runner_kwargs() methods used below are
        # not defined in this class body -- confirm where they come from.
        if self.options.runner == 'emr':
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            return LocalMRJobRunner(**self.local_job_runner_kwargs())

    @classmethod
    def set_up_logging(cls, quiet=False, verbose=False, stream=None):
        """Set up logging when running from the command line. This is also
        used by the various command-line utilities.

        :param bool quiet: If true, don't log. Overrides *verbose*.
        :param bool verbose: If true, set log level to ``DEBUG`` (default is
                             ``INFO``)
        :param stream: Stream to log to (default is ``sys.stderr``)

        This will also set up a null log handler for boto, so we don't get
        warnings if boto tries to log about throttling and whatnot.
        """
        if quiet:
            log_to_null(name='mrjob')
            log_to_null(name='__main__')
        else:
            log_to_stream(name='mrjob', debug=verbose, stream=stream)
            log_to_stream(name='__main__', debug=verbose, stream=stream)

        log_to_null(name='boto')

    def run_job(self):
        """Run all the steps of the job, logging errors (and debugging
        output if :option:`--verbose` is specified) to STDERR and streaming
        the output to STDOUT.

        Called from :py:meth:`run`. You'd probably only want to call this
        directly from automated tests.
        """
        self.set_up_logging(quiet=self.options.quiet,
                            verbose=self.options.verbose,
                            stream=self.stderr)

        with self.make_runner() as runner:
            runner.run()

            if not self.options.no_output:
                for line in runner.stream_output():
                    self.stdout.write(line)
                self.stdout.flush()

    ### Command-line arguments ###

    def configure_options(self):
        """Define arguments for this script. Called from :py:meth:`__init__()`.

        Run ``python -m mrjob.job.MRJob --help`` to see all options.

        Re-define to define custom command-line arguments::

            def configure_options(self):
                super(MRYourJob, self).configure_options()

                self.add_passthrough_option(...)
                self.add_file_option(...)
                ...
        """
        # the parser is created with add_help_option=False, so the help
        # flags are defined by hand and handled in load_options()
        self.option_parser.add_option(
            '--help', dest='help_main', action='store_true', default=False,
            help='show this message and exit')

        self.option_parser.add_option(
            '--help-emr', dest='help_emr', action='store_true', default=False,
            help='show EMR-related options')

        self.option_parser.add_option(
            '--help-hadoop', dest='help_hadoop', action='store_true',
            default=False,
            help='show Hadoop-related options')

        self.option_parser.add_option(
            '--help-runner', dest='help_runner', action='store_true',
            default=False, help='show runner-related options')

        # protocol stuff
        self.proto_opt_group = OptionGroup(
            self.option_parser, 'Protocols')
        self.option_parser.add_option_group(self.proto_opt_group)

        add_protocol_opts(self.proto_opt_group)

        # options for running the entire job
        self.runner_opt_group = OptionGroup(
            self.option_parser, 'Running the entire job')
        self.option_parser.add_option_group(self.runner_opt_group)

        add_runner_opts(self.runner_opt_group, self._DEFAULT_RUNNER)
        add_basic_opts(self.runner_opt_group)

        self.hadoop_opts_opt_group = OptionGroup(
            self.option_parser,
            'Configuring or emulating Hadoop (these apply when you set -r'
            ' hadoop, -r emr, or -r local)')
        self.option_parser.add_option_group(self.hadoop_opts_opt_group)

        add_hadoop_shared_opts(self.hadoop_opts_opt_group)

        # options common to Hadoop and EMR
        self.hadoop_emr_opt_group = OptionGroup(
            self.option_parser,
            'Running on Hadoop or EMR (these apply when you set -r hadoop or'
            ' -r emr)')
        self.option_parser.add_option_group(self.hadoop_emr_opt_group)

        add_hadoop_emr_opts(self.hadoop_emr_opt_group)

        # options for running the job on Hadoop
        self.hadoop_opt_group = OptionGroup(
            self.option_parser,
            'Running on Hadoop (these apply when you set -r hadoop)')
        self.option_parser.add_option_group(self.hadoop_opt_group)

        add_hadoop_opts(self.hadoop_opt_group)

        # options for running the job on EMR
        self.emr_opt_group = OptionGroup(
            self.option_parser,
            'Running on Amazon Elastic MapReduce (these apply when you set -r'
            ' emr)')
        self.option_parser.add_option_group(self.emr_opt_group)

        add_emr_opts(self.emr_opt_group)

    def all_option_groups(self):
        """Return a tuple of the option parser and the option groups
        created by :py:meth:`configure_options`.
        """
        # NOTE(review): self.hadoop_opt_group is not part of this tuple --
        # confirm whether that is intentional.
        return (self.option_parser, self.proto_opt_group,
                self.runner_opt_group, self.hadoop_emr_opt_group,
                self.emr_opt_group, self.hadoop_opts_opt_group)

    def is_mapper_or_reducer(self):
        """True if this is a mapper/reducer.

        This is mostly useful inside :py:meth:`load_options`, to disable
        loading options when we aren't running inside Hadoop Streaming.

        Always ``False`` for the launcher.
        """
        return False

    def add_passthrough_option(self, *args, **kwargs):
        """Function to create options which both the job runner
        and the job itself respect (we use this for protocols, for example).

        Use it like you would use :py:func:`optparse.OptionParser.add_option`::

            def configure_options(self):
                super(MRYourJob, self).configure_options()
                self.add_passthrough_option(
                    '--max-ngram-size', type='int', default=4, help='...')

        Specify an *opt_group* keyword argument to add the option to that
        :py:class:`OptionGroup` rather than the top-level
        :py:class:`OptionParser`.

        If you want to pass files through to the mapper/reducer, use
        :py:meth:`add_file_option` instead.
        """
        if 'opt_group' in kwargs:
            pass_opt = kwargs.pop('opt_group').add_option(*args, **kwargs)
        else:
            pass_opt = self.option_parser.add_option(*args, **kwargs)

        self._passthrough_options.append(pass_opt)

    def add_file_option(self, *args, **kwargs):
        """Add a command-line option that sends an external file
        (e.g. a SQLite DB) to Hadoop::

             def configure_options(self):
                super(MRYourJob, self).configure_options()
                self.add_file_option('--scoring-db', help=...)

        This does the right thing: the file will be uploaded to the working
        dir of the script on Hadoop, and the script will be passed the same
        option, but with the local name of the file in the script's working
        directory.

        We suggest against sending Berkeley DBs to your job, as
        Berkeley DB is not forwards-compatible (so a Berkeley DB that you
        construct on your computer may not be readable from within
        Hadoop). Use SQLite databases instead. If all you need is an on-disk
        hash table, try out the :py:mod:`sqlite3dbm` module.

        :raises OptionError: if the option's type is not ``'string'`` or its
                             action is not ``'store'`` or ``'append'``. The
                             option has already been added to the parser by
                             the time this is raised.
        """
        pass_opt = self.option_parser.add_option(*args, **kwargs)

        # OptionError takes the offending option as its second argument.
        # (The message used to be %-formatted without a placeholder, which
        # raised TypeError instead of the intended OptionError.)
        if pass_opt.type != 'string':
            raise OptionError(
                'passthrough file options must take strings', pass_opt)

        if pass_opt.action not in ('store', 'append'):
            raise OptionError("passthrough file options must use the options"
                              " 'store' or 'append'", pass_opt)

        self._file_options.append(pass_opt)

    def _process_args(self, args):
        """mrjob.launch takes the first arg as the script path, but mrjob.job
        uses all args as input files. This method determines the behavior:
        MRJobLauncher takes off the first arg as the script path.
        """
        # NOTE(review): when a script path was already supplied, self.args
        # is not set here -- confirm callers don't rely on it in that case.
        if not self._script_path:
            if len(args) < 1:
                self.option_parser.error('Must supply script path')
            else:
                self._script_path = os.path.abspath(args[0])
                self.args = args[1:]

    def _help_main(self):
        """Print the top-level help (without any option groups) and exit."""
        self.option_parser.option_groups = []
        self.option_parser.print_help()
        sys.exit(0)

    def load_options(self, args):
        """Load command-line options into ``self.options``.

        Called from :py:meth:`__init__()` after :py:meth:`configure_options`.

        :type args: list of str
        :param args: a list of command line arguments.
                     (:py:meth:`__init__()` replaces a false *args*, such
                     as ``None``, with ``[]`` before calling this.)

        Re-define if you want to post-process command-line arguments::

            def load_options(self, args):
                super(MRYourJob, self).load_options(args)

                self.stop_words = self.options.stop_words.split(',')
                ...
        """
        self.options, args = self.option_parser.parse_args(args)

        if self.options.help_main:
            self._help_main()

        if self.options.help_emr:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.emr_opt_group)
            sys.exit(0)

        if self.options.help_hadoop:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.hadoop_opts_opt_group)
            sys.exit(0)

        if self.options.help_runner:
            print_help_for_groups(self.runner_opt_group)
            sys.exit(0)

        self._process_args(args)

        # parse custom options here to avoid setting a custom Option subclass
        # and confusing users

        if self.options.ssh_bind_ports:
            try:
                ports = parse_port_range_list(self.options.ssh_bind_ports)
            # "except X as e" works on Python 2.6+ and Python 3; the old
            # "except X, e" form is a SyntaxError on Python 3
            except ValueError as e:
                self.option_parser.error(
                    'invalid port range list "%s": \n%s' %
                    (self.options.ssh_bind_ports, e.args[0]))
            self.options.ssh_bind_ports = ports

        cmdenv_err = 'cmdenv argument "%s" is not of the form KEY=VALUE'
        self.options.cmdenv = parse_key_value_list(self.options.cmdenv,
                                                   cmdenv_err,
                                                   self.option_parser.error)

        jobconf_err = 'jobconf argument "%s" is not of the form KEY=VALUE'
        self.options.jobconf = parse_key_value_list(self.options.jobconf,
                                                    jobconf_err,
                                                    self.option_parser.error)

        emr_api_err = (
            'emr-api-params argument "%s" is not of the form KEY=VALUE')
        self.options.emr_api_params = parse_key_value_list(
            self.options.emr_api_params,
            emr_api_err,
            self.option_parser.error)

        # no_emr_api_params just exists to modify emr_api_params
        for param in self.options.no_emr_api_params:
            self.options.emr_api_params[param] = None

        def parse_commas(cleanup_str):
            """Split *cleanup_str* on commas and validate each choice."""
            cleanup_error = ('cleanup option %s is not one of ' +
                             ', '.join(CLEANUP_CHOICES))
            new_cleanup_options = []

            for choice in cleanup_str.split(','):
                if choice in CLEANUP_CHOICES:
                    new_cleanup_options.append(choice)
                else:
                    self.option_parser.error(cleanup_error % choice)

            # NONE only makes sense on its own
            if ('NONE' in new_cleanup_options and
                    len(set(new_cleanup_options)) > 1):
                self.option_parser.error(
                    'Cannot clean up both nothing and something!')

            return new_cleanup_options

        if self.options.cleanup is not None:
            self.options.cleanup = parse_commas(self.options.cleanup)

        if self.options.cleanup_on_failure is not None:
            self.options.cleanup_on_failure = parse_commas(
                self.options.cleanup_on_failure)
def load_options(self, args):
    """Load command-line options into ``self.options``.

    The positional arguments left over after option parsing are handed to
    ``self._process_args()``, which is expected to fill in
    ``self._script_path`` and ``self.args``.

    Called from :py:meth:`__init__()` after :py:meth:`configure_options`.

    :type args: list of str
    :param args: a list of command line arguments, passed unchanged to
                 ``self.option_parser.parse_args()``.

    Re-define if you want to post-process command-line arguments::

        def load_options(self, args):
            super(MRYourJob, self).load_options(args)

            self.stop_words = self.options.stop_words.split(',')
            ...
    """
    parsed, leftover = self.option_parser.parse_args(args)
    self.options = parsed

    if self.options.help_main:
        self._help_main()

    # each of the remaining help flags prints a subset of the option
    # groups and exits
    if self.options.help_emr:
        print_help_for_groups(self.hadoop_emr_opt_group,
                              self.emr_opt_group)
        sys.exit(0)

    if self.options.help_hadoop:
        print_help_for_groups(self.hadoop_emr_opt_group,
                              self.hadoop_opts_opt_group)
        sys.exit(0)

    if self.options.help_runner:
        print_help_for_groups(self.runner_opt_group)
        sys.exit(0)

    self._process_args(leftover)

    # custom options are parsed below rather than through a custom Option
    # subclass, to avoid confusing users

    def convert_key_values(dest, not_key_value):
        # replace the raw list stored under *dest* with its parsed form
        raw_pairs = getattr(self.options, dest)
        setattr(self.options, dest,
                parse_key_value_list(raw_pairs,
                                     not_key_value,
                                     self.option_parser.error))

    def parse_commas(cleanup_str):
        unknown_choice = ('cleanup option %s is not one of ' +
                          ', '.join(CLEANUP_CHOICES))
        accepted = []

        for item in cleanup_str.split(','):
            if item not in CLEANUP_CHOICES:
                self.option_parser.error(unknown_choice % item)
            else:
                accepted.append(item)

        # NONE only makes sense on its own
        if len(set(accepted)) > 1 and 'NONE' in accepted:
            self.option_parser.error(
                'Cannot clean up both nothing and something!')

        return accepted

    port_spec = self.options.ssh_bind_ports
    if port_spec:
        try:
            ports = parse_port_range_list(port_spec)
        except ValueError as err:
            self.option_parser.error('invalid port range list "%s": \n%s' %
                                     (port_spec, err.args[0]))
        self.options.ssh_bind_ports = ports

    convert_key_values(
        'cmdenv', 'cmdenv argument "%s" is not of the form KEY=VALUE')

    convert_key_values(
        'jobconf', 'jobconf argument "%s" is not of the form KEY=VALUE')

    # emr_api_params
    convert_key_values(
        'emr_api_params',
        'emr-api-params argument "%s" is not of the form KEY=VALUE')

    # no_emr_api_params just exists to modify emr_api_params
    for name in self.options.no_emr_api_params:
        self.options.emr_api_params[name] = None

    raw_cleanup = self.options.cleanup
    if raw_cleanup is not None:
        self.options.cleanup = parse_commas(raw_cleanup)

    raw_cleanup_on_failure = self.options.cleanup_on_failure
    if raw_cleanup_on_failure is not None:
        self.options.cleanup_on_failure = parse_commas(
            raw_cleanup_on_failure)