Example #1
def _fix_custom_options(options, option_parser):
    """Update *options* to handle KEY=VALUE options, etc."""
    if hasattr(options, 'cmdenv'):
        cmdenv_err = '--cmdenv argument %r is not of the form KEY=VALUE'
        options.cmdenv = parse_key_value_list(options.cmdenv,
                                              cmdenv_err,
                                              option_parser.error)

    def parse_commas(cleanup_str):
        cleanup_error = ('cleanup option %s is not one of ' +
                         ', '.join(CLEANUP_CHOICES))
        new_cleanup_options = []
        for choice in cleanup_str.split(','):
            if choice in CLEANUP_CHOICES:
                new_cleanup_options.append(choice)
            else:
                option_parser.error(cleanup_error % choice)
        if ('NONE' in new_cleanup_options and
                len(set(new_cleanup_options)) > 1):
            option_parser.error(
                'Cannot clean up both nothing and something!')

        return new_cleanup_options

    if getattr(options, 'cleanup', None):
        options.cleanup = parse_commas(options.cleanup)

    if getattr(options, 'cleanup_on_failure', None):
        options.cleanup_on_failure = parse_commas(options.cleanup_on_failure)

    if hasattr(options, 'emr_api_params'):
        emr_api_err = (
            '--emr-api-params argument %r is not of the form KEY=VALUE')
        options.emr_api_params = parse_key_value_list(options.emr_api_params,
                                                      emr_api_err,
                                                      option_parser.error)

        if hasattr(options, 'no_emr_api_params'):
            for param in options.no_emr_api_params:
                options.emr_api_params[param] = None

    if hasattr(options, 'emr_tags'):
        emr_tag_err = '--emr-tag argument %r is not of the form KEY=VALUE'
        options.emr_tags = parse_key_value_list(options.emr_tags,
                                                emr_tag_err,
                                                option_parser.error)

    if hasattr(options, 'jobconf'):
        jobconf_err = '--jobconf argument %r is not of the form KEY=VALUE'
        options.jobconf = parse_key_value_list(options.jobconf,
                                               jobconf_err,
                                               option_parser.error)

    if getattr(options, 'ssh_bind_ports', None):
        try:
            ports = parse_port_range_list(options.ssh_bind_ports)
        except ValueError as e:
            option_parser.error('invalid port range list %r: \n%s' %
                                (options.ssh_bind_ports, e.args[0]))
        options.ssh_bind_ports = ports
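
A minimal driving sketch for _fix_custom_options() above (not from the library): it assumes the snippet and its helpers (parse_key_value_list, parse_port_range_list, CLEANUP_CHOICES) are already in scope, registers only the destinations it touches, and uses illustrative option values. 'LOGS' is assumed to be a valid CLEANUP_CHOICES entry; the exact set depends on the mrjob version.

# Hypothetical driver; the option names and sample values are for
# illustration only.
from optparse import OptionParser

option_parser = OptionParser()
option_parser.add_option('--cmdenv', dest='cmdenv', action='append', default=[])
option_parser.add_option('--jobconf', dest='jobconf', action='append', default=[])
option_parser.add_option('--cleanup', dest='cleanup', default=None)

options, args = option_parser.parse_args(
    ['--cmdenv', 'TZ=UTC', '--jobconf', 'mapreduce.job.reduces=2',
     '--cleanup', 'LOGS'])

# Normalize in place: cmdenv/jobconf become dicts and cleanup becomes a
# validated list; malformed input goes through option_parser.error(),
# which reports the problem and exits.
_fix_custom_options(options, option_parser)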
Example #2
    def load_options(self, args):
        """Load command-line options into ``self.options``.

        Called from :py:meth:`__init__()` after :py:meth:`configure_options`.

        :type args: list of str
        :param args: a list of command line arguments. ``None`` will be
                     treated the same as ``[]``.

        Re-define if you want to post-process command-line arguments::

            def load_options(self, args):
                super(MRYourJob, self).load_options(args)

                self.stop_words = self.options.stop_words.split(',')
                ...
        """
        self.options, args = self.option_parser.parse_args(args)

        if self.options.help_main:
            self._help_main()

        if self.options.help_emr:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.emr_opt_group)
            sys.exit(0)

        if self.options.help_hadoop:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.hadoop_opts_opt_group)
            sys.exit(0)

        if self.options.help_runner:
            print_help_for_groups(self.runner_opt_group)
            sys.exit(0)

        self._process_args(args)

        # parse custom options here to avoid setting a custom Option subclass
        # and confusing users

        if self.options.ssh_bind_ports:
            try:
                ports = parse_port_range_list(self.options.ssh_bind_ports)
            except ValueError as e:
                self.option_parser.error(
                    'invalid port range list "%s": \n%s' %
                    (self.options.ssh_bind_ports, e.args[0]))
            self.options.ssh_bind_ports = ports

        cmdenv_err = 'cmdenv argument "%s" is not of the form KEY=VALUE'
        self.options.cmdenv = parse_key_value_list(self.options.cmdenv,
                                                   cmdenv_err,
                                                   self.option_parser.error)

        jobconf_err = 'jobconf argument "%s" is not of the form KEY=VALUE'
        self.options.jobconf = parse_key_value_list(self.options.jobconf,
                                                    jobconf_err,
                                                    self.option_parser.error)

        emr_api_err = 'emr-api-params argument "%s" is not of the form KEY=VALUE'
        self.options.emr_api_params = parse_key_value_list(
            self.options.emr_api_params, emr_api_err, self.option_parser.error)
        for param in self.options.no_emr_api_params:
            self.options.emr_api_params[param] = None

        def parse_commas(cleanup_str):
            cleanup_error = ('cleanup option %s is not one of ' +
                             ', '.join(CLEANUP_CHOICES))
            new_cleanup_options = []
            for choice in cleanup_str.split(','):
                if choice in CLEANUP_CHOICES:
                    new_cleanup_options.append(choice)
                else:
                    self.option_parser.error(cleanup_error % choice)
            if ('NONE' in new_cleanup_options
                    and len(set(new_cleanup_options)) > 1):
                self.option_parser.error(
                    'Cannot clean up both nothing and something!')
            return new_cleanup_options

        if self.options.cleanup is not None:
            self.options.cleanup = parse_commas(self.options.cleanup)
        if self.options.cleanup_on_failure is not None:
            self.options.cleanup_on_failure = parse_commas(
                self.options.cleanup_on_failure)
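
A hedged sketch of the subclass pattern this docstring describes: declare a custom option in configure_options(), then post-process it in load_options() after calling the parent implementation. MRYourJob and --stop-words are illustrative names taken from the docstring, not part of the library.

from mrjob.job import MRJob


class MRYourJob(MRJob):

    def configure_options(self):
        super(MRYourJob, self).configure_options()
        # a passthrough option is visible to both the runner and the job
        self.add_passthrough_option(
            '--stop-words', default='a,the,of',
            help='comma-separated list of words to ignore')

    def load_options(self, args):
        super(MRYourJob, self).load_options(args)
        # post-process the raw string once, after parsing
        self.stop_words = self.options.stop_words.split(',')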
Example #3
class MRJobLauncher(object):
    """Handle running a MapReduce job on an executable from the command line.
    This class will eventually support running arbitrary executables; for now
    it only supports :py:class:`~mrjob.job.MRJob` subclasses. Up to v0.5 it is
    effectively part of the :py:class:`~mrjob.job.MRJob` class itself and
    should not be used externally in any way.
    """

    #: :py:class:`optparse.Option` subclass to use with the
    #: :py:class:`optparse.OptionParser` instance.
    OPTION_CLASS = Option

    _DEFAULT_RUNNER = 'local'

    def __init__(self, script_path=None, args=None, from_cl=False):
        """
        :param script_path: Path to script unless it's the first item of *args*
        :param args: Command line arguments
        :param from_cl: If not using sys.argv but still coming from the
                        command line (as opposed to a script, e.g. from
                        mrjob.cmd), don't override the option parser's error
                        function (exit instead of raising ValueError).
        """

        if script_path is not None:
            script_path = os.path.abspath(script_path)
        self._script_path = script_path

        # make sure we respect the $TZ (time zone) environment variable
        if hasattr(time, 'tzset'):
            time.tzset()

        self._passthrough_options = []
        self._file_options = []

        self.option_parser = OptionParser(usage=self._usage(),
                                          option_class=self.OPTION_CLASS,
                                          add_help_option=False)
        self.configure_options()

        # don't pass None to parse_args unless we're actually running
        # the MRJob script
        if args is _READ_ARGS_FROM_SYS_ARGV:
            self._cl_args = sys.argv[1:]
        else:
            # don't pass sys.argv to self.option_parser, and have it
            # raise an exception on error rather than printing to stderr
            # and exiting.
            self._cl_args = args or []

            def error(msg):
                raise ValueError(msg)

            if not from_cl:
                self.option_parser.error = error

        self.load_options(self._cl_args)

        # Make it possible to redirect stdin, stdout, and stderr, for testing
        # See sandbox(), below.
        self.stdin = sys.stdin
        self.stdout = sys.stdout
        self.stderr = sys.stderr

    @classmethod
    def _usage(cls):
        """Command line usage string for this class"""
        return ("usage: mrjob run [script path|executable path|--help]"
                " [options] [input files]")

    @classmethod
    def run(cls, args=_READ_ARGS_FROM_SYS_ARGV):
        """Entry point for running job from the command-line.

        This is also the entry point when a mapper or reducer is run
        by Hadoop Streaming.

        Does one of:

        * Print step information (:option:`--steps`). See :py:meth:`show_steps`
        * Run a mapper (:option:`--mapper`). See :py:meth:`run_mapper`
        * Run a combiner (:option:`--combiner`). See :py:meth:`run_combiner`
        * Run a reducer (:option:`--reducer`). See :py:meth:`run_reducer`
        * Run the entire job. See :py:meth:`run_job`
        """
        # load options from the command line
        launcher = cls(args=args)
        launcher.run_job()

    def execute(self):
        # Launcher only runs jobs, doesn't do any Hadoop Streaming stuff
        self.run_job()

    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        # have to import here so that we can still run the MRJob
        # without importing boto
        from mrjob.emr import EMRJobRunner
        from mrjob.hadoop import HadoopJobRunner
        from mrjob.local import LocalMRJobRunner

        if self.options.runner == 'emr':
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            return LocalMRJobRunner(**self.local_job_runner_kwargs())

    @classmethod
    def set_up_logging(cls, quiet=False, verbose=False, stream=None):
        """Set up logging when running from the command line. This is also
        used by the various command-line utilities.

        :param bool quiet: If true, don't log. Overrides *verbose*.
        :param bool verbose: If true, set log level to ``DEBUG`` (default is
                             ``INFO``)
        :param stream: Stream to log to (default is ``sys.stderr``)

        This will also set up a null log handler for boto, so we don't get
        warnings if boto tries to log about throttling and whatnot.
        """
        if quiet:
            log_to_null(name='mrjob')
            log_to_null(name='__main__')
        else:
            log_to_stream(name='mrjob', debug=verbose, stream=stream)
            log_to_stream(name='__main__', debug=verbose, stream=stream)

        log_to_null(name='boto')

    def run_job(self):
        """Run the all steps of the job, logging errors (and debugging output
        if :option:`--verbose` is specified) to STDERR and streaming the
        output to STDOUT.

        Called from :py:meth:`run`. You'd probably only want to call this
        directly from automated tests.
        """
        self.set_up_logging(quiet=self.options.quiet,
                            verbose=self.options.verbose,
                            stream=self.stderr)

        with self.make_runner() as runner:
            runner.run()

            if not self.options.no_output:
                for line in runner.stream_output():
                    self.stdout.write(line)
                self.stdout.flush()

    ### Command-line arguments ###

    def configure_options(self):
        """Define arguments for this script. Called from :py:meth:`__init__()`.

        Run ``python -m mrjob.job.MRJob --help`` to see all options.

        Re-define to define custom command-line arguments::

            def configure_options(self):
                super(MRYourJob, self).configure_options()

                self.add_passthrough_option(...)
                self.add_file_option(...)
                ...
        """
        self.option_parser.add_option('--help',
                                      dest='help_main',
                                      action='store_true',
                                      default=False,
                                      help='show this message and exit')

        self.option_parser.add_option('--help-emr',
                                      dest='help_emr',
                                      action='store_true',
                                      default=False,
                                      help='show EMR-related options')

        self.option_parser.add_option('--help-hadoop',
                                      dest='help_hadoop',
                                      action='store_true',
                                      default=False,
                                      help='show Hadoop-related options')

        self.option_parser.add_option('--help-runner',
                                      dest='help_runner',
                                      action='store_true',
                                      default=False,
                                      help='show runner-related options')

        # protocol stuff
        self.proto_opt_group = OptionGroup(self.option_parser, 'Protocols')
        self.option_parser.add_option_group(self.proto_opt_group)

        add_protocol_opts(self.proto_opt_group)

        # options for running the entire job
        self.runner_opt_group = OptionGroup(self.option_parser,
                                            'Running the entire job')
        self.option_parser.add_option_group(self.runner_opt_group)

        add_runner_opts(self.runner_opt_group, self._DEFAULT_RUNNER)
        add_basic_opts(self.runner_opt_group)

        self.hadoop_opts_opt_group = OptionGroup(
            self.option_parser,
            'Configuring or emulating Hadoop (these apply when you set -r'
            ' hadoop, -r emr, or -r local)')
        self.option_parser.add_option_group(self.hadoop_opts_opt_group)

        add_hadoop_shared_opts(self.hadoop_opts_opt_group)

        # options common to Hadoop and EMR
        self.hadoop_emr_opt_group = OptionGroup(
            self.option_parser,
            'Running on Hadoop or EMR (these apply when you set -r hadoop or'
            ' -r emr)')
        self.option_parser.add_option_group(self.hadoop_emr_opt_group)

        add_hadoop_emr_opts(self.hadoop_emr_opt_group)

        # options for running the job on Hadoop
        self.hadoop_opt_group = OptionGroup(
            self.option_parser,
            'Running on Hadoop (these apply when you set -r hadoop)')
        self.option_parser.add_option_group(self.hadoop_opt_group)

        add_hadoop_opts(self.hadoop_opt_group)

        # options for running the job on EMR
        self.emr_opt_group = OptionGroup(
            self.option_parser,
            'Running on Amazon Elastic MapReduce (these apply when you set -r'
            ' emr)')
        self.option_parser.add_option_group(self.emr_opt_group)

        add_emr_opts(self.emr_opt_group)

    def all_option_groups(self):
        return (self.option_parser, self.proto_opt_group,
                self.runner_opt_group, self.hadoop_emr_opt_group,
                self.emr_opt_group, self.hadoop_opts_opt_group)

    def is_mapper_or_reducer(self):
        """True if this is a mapper/reducer.

        This is mostly useful inside :py:meth:`load_options`, to disable
        loading options when we aren't running inside Hadoop Streaming.
        """
        return False

    def add_passthrough_option(self, *args, **kwargs):
        """Function to create options which both the job runner
        and the job itself respect (we use this for protocols, for example).

        Use it like you would use :py:func:`optparse.OptionParser.add_option`::

            def configure_options(self):
                super(MRYourJob, self).configure_options()
                self.add_passthrough_option(
                    '--max-ngram-size', type='int', default=4, help='...')

        Specify an *opt_group* keyword argument to add the option to that
        :py:class:`OptionGroup` rather than the top-level
        :py:class:`OptionParser`.

        If you want to pass files through to the mapper/reducer, use
        :py:meth:`add_file_option` instead.
        """
        if 'opt_group' in kwargs:
            pass_opt = kwargs.pop('opt_group').add_option(*args, **kwargs)
        else:
            pass_opt = self.option_parser.add_option(*args, **kwargs)

        self._passthrough_options.append(pass_opt)

    def add_file_option(self, *args, **kwargs):
        """Add a command-line option that sends an external file
        (e.g. a SQLite DB) to Hadoop::

             def configure_options(self):
                super(MRYourJob, self).configure_options()
                self.add_file_option('--scoring-db', help=...)

        This does the right thing: the file will be uploaded to the working
        dir of the script on Hadoop, and the script will be passed the same
        option, but with the local name of the file in the script's working
        directory.

        We advise against sending Berkeley DBs to your job, as
        Berkeley DB is not forwards-compatible (so a Berkeley DB that you
        construct on your computer may not be readable from within
        Hadoop). Use SQLite databases instead. If all you need is an on-disk
        hash table, try out the :py:mod:`sqlite3dbm` module.
        """
        pass_opt = self.option_parser.add_option(*args, **kwargs)

        if pass_opt.type != 'string':
            raise OptionError('passthrough file options must take strings,'
                              ' not %r' % pass_opt.type)

        if pass_opt.action not in ('store', 'append'):
            raise OptionError("passthrough file options must use the actions"
                              " 'store' or 'append'")

        self._file_options.append(pass_opt)

    def _process_args(self, args):
        """mrjob.launch takes the first arg as the script path, but mrjob.job
        uses all args as input files. This method determines the behavior:
        MRJobLauncher takes off the first arg as the script path.
        """
        if not self._script_path:
            if len(args) < 1:
                self.option_parser.error('Must supply script path')
            else:
                self._script_path = os.path.abspath(args[0])
                self.args = args[1:]

    def _help_main(self):
        self.option_parser.option_groups = []
        self.option_parser.print_help()
        sys.exit(0)

    def load_options(self, args):
        """Load command-line options into ``self.options``.

        Called from :py:meth:`__init__()` after :py:meth:`configure_options`.

        :type args: list of str
        :param args: a list of command line arguments. ``None`` will be
                     treated the same as ``[]``.

        Re-define if you want to post-process command-line arguments::

            def load_options(self, args):
                super(MRYourJob, self).load_options(args)

                self.stop_words = self.options.stop_words.split(',')
                ...
        """
        self.options, args = self.option_parser.parse_args(args)

        if self.options.help_main:
            self._help_main()

        if self.options.help_emr:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.emr_opt_group)
            sys.exit(0)

        if self.options.help_hadoop:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.hadoop_opts_opt_group)
            sys.exit(0)

        if self.options.help_runner:
            print_help_for_groups(self.runner_opt_group)
            sys.exit(0)

        self._process_args(args)

        # parse custom options here to avoid setting a custom Option subclass
        # and confusing users

        if self.options.ssh_bind_ports:
            try:
                ports = parse_port_range_list(self.options.ssh_bind_ports)
            except ValueError as e:
                self.option_parser.error(
                    'invalid port range list "%s": \n%s' %
                    (self.options.ssh_bind_ports, e.args[0]))
            self.options.ssh_bind_ports = ports

        cmdenv_err = 'cmdenv argument "%s" is not of the form KEY=VALUE'
        self.options.cmdenv = parse_key_value_list(self.options.cmdenv,
                                                   cmdenv_err,
                                                   self.option_parser.error)

        jobconf_err = 'jobconf argument "%s" is not of the form KEY=VALUE'
        self.options.jobconf = parse_key_value_list(self.options.jobconf,
                                                    jobconf_err,
                                                    self.option_parser.error)

        emr_api_err = 'emr-api-params argument "%s" is not of the form KEY=VALUE'
        self.options.emr_api_params = parse_key_value_list(
            self.options.emr_api_params, emr_api_err, self.option_parser.error)
        for param in self.options.no_emr_api_params:
            self.options.emr_api_params[param] = None

        def parse_commas(cleanup_str):
            cleanup_error = ('cleanup option %s is not one of ' +
                             ', '.join(CLEANUP_CHOICES))
            new_cleanup_options = []
            for choice in cleanup_str.split(','):
                if choice in CLEANUP_CHOICES:
                    new_cleanup_options.append(choice)
                else:
                    self.option_parser.error(cleanup_error % choice)
            if ('NONE' in new_cleanup_options
                    and len(set(new_cleanup_options)) > 1):
                self.option_parser.error(
                    'Cannot clean up both nothing and something!')
            return new_cleanup_options

        if self.options.cleanup is not None:
            self.options.cleanup = parse_commas(self.options.cleanup)
        if self.options.cleanup_on_failure is not None:
            self.options.cleanup_on_failure = parse_commas(
                self.options.cleanup_on_failure)
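
A minimal usage sketch for the launcher class above; the script path and input file are placeholders. Because args is passed explicitly and from_cl defaults to False, option errors raise ValueError rather than exiting, which is convenient in tests.

# '-r local' selects LocalMRJobRunner via make_runner(); paths are placeholders.
launcher = MRJobLauncher(
    args=['path/to/your_mr_job.py', 'input.txt', '-r', 'local'])
launcher.run_job()

# As a command-line entry point, the classmethod reads sys.argv itself:
#     MRJobLauncher.run()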
Example #4
    def load_options(self, args):
        """Load command-line options into ``self.options``,
        ``self._script_path``, and ``self.args``.

        Called from :py:meth:`__init__()` after :py:meth:`configure_options`.

        :type args: list of str
        :param args: a list of command line arguments. ``None`` will be
                     treated the same as ``[]``.

        Re-define if you want to post-process command-line arguments::

            def load_options(self, args):
                super(MRYourJob, self).load_options(args)

                self.stop_words = self.options.stop_words.split(',')
                ...
        """
        self.options, args = self.option_parser.parse_args(args)

        if self.options.help_main:
            self._help_main()

        if self.options.help_emr:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.emr_opt_group)
            sys.exit(0)

        if self.options.help_hadoop:
            print_help_for_groups(self.hadoop_emr_opt_group,
                                  self.hadoop_opts_opt_group)
            sys.exit(0)

        if self.options.help_runner:
            print_help_for_groups(self.runner_opt_group)
            sys.exit(0)

        self._process_args(args)

        # parse custom options here to avoid setting a custom Option subclass
        # and confusing users

        if self.options.ssh_bind_ports:
            try:
                ports = parse_port_range_list(self.options.ssh_bind_ports)
            except ValueError as e:
                self.option_parser.error('invalid port range list "%s": \n%s' %
                                         (self.options.ssh_bind_ports,
                                          e.args[0]))
            self.options.ssh_bind_ports = ports

        cmdenv_err = 'cmdenv argument "%s" is not of the form KEY=VALUE'
        self.options.cmdenv = parse_key_value_list(self.options.cmdenv,
                                                   cmdenv_err,
                                                   self.option_parser.error)

        jobconf_err = 'jobconf argument "%s" is not of the form KEY=VALUE'
        self.options.jobconf = parse_key_value_list(self.options.jobconf,
                                                    jobconf_err,
                                                    self.option_parser.error)

        # emr_api_params
        emr_api_err = (
            'emr-api-params argument "%s" is not of the form KEY=VALUE')

        self.options.emr_api_params = parse_key_value_list(
            self.options.emr_api_params,
            emr_api_err,
            self.option_parser.error)

        # no_emr_api_params just exists to modify emr_api_params
        for param in self.options.no_emr_api_params:
            self.options.emr_api_params[param] = None

        def parse_commas(cleanup_str):
            cleanup_error = ('cleanup option %s is not one of ' +
                             ', '.join(CLEANUP_CHOICES))
            new_cleanup_options = []
            for choice in cleanup_str.split(','):
                if choice in CLEANUP_CHOICES:
                    new_cleanup_options.append(choice)
                else:
                    self.option_parser.error(cleanup_error % choice)
            if ('NONE' in new_cleanup_options and
                    len(set(new_cleanup_options)) > 1):
                self.option_parser.error(
                    'Cannot clean up both nothing and something!')
            return new_cleanup_options

        if self.options.cleanup is not None:
            self.options.cleanup = parse_commas(self.options.cleanup)
        if self.options.cleanup_on_failure is not None:
            self.options.cleanup_on_failure = parse_commas(
                self.options.cleanup_on_failure)
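
The snippets above lean on parse_key_value_list from mrjob's utilities, whose body isn't shown here. The sketch below is only a stand-in that mirrors the calling convention visible in the code (a list of 'KEY=VALUE' strings, an error format string, and an error callback, returning a dict); the real helper may differ in detail.

def parse_key_value_list_sketch(kv_string_list, error_fmt, error_func):
    """Rough stand-in for the helper used above (behavior assumed)."""
    result = {}
    for kv_string in kv_string_list or []:
        key, sep, value = kv_string.partition('=')
        if not sep:
            # not of the form KEY=VALUE; report via the parser's error hook
            error_func(error_fmt % (kv_string,))
            continue
        result[key] = value
    return result

# e.g. ['TZ=UTC', 'PYTHONPATH=.'] -> {'TZ': 'UTC', 'PYTHONPATH': '.'}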