    def _parse_setup(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.

        If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
        true, create mrjob.tar.gz (if it doesn't exist already) and
        prepend a setup command that adds it to PYTHONPATH.

        Also patch in the deprecated
        options *python_archives*, *setup_cmds*, and *setup_scripts*
        as setup commands.
        """
        setup = []

        # python_archives
        for path in self._opts['python_archives']:
            path_dict = parse_legacy_hash_path('archive', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:
            setup.append(parse_setup_cmd(cmd))

        # setup_cmds
        for cmd in self._opts['setup_cmds']:
            if not isinstance(cmd, basestring):
                cmd = cmd_line(cmd)
            setup.append([cmd])

        # setup_scripts
        for path in self._opts['setup_scripts']:
            path_dict = parse_legacy_hash_path('file', path)
            setup.append([path_dict])

        return setup
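
For orientation, here is a minimal sketch (not part of mrjob; the archive path is made up) of the data shapes involved: parse_legacy_hash_path() returns a dict with 'type', 'path' and 'name' keys, and each entry of the returned setup list interleaves shell fragments with such dicts, which the runner later resolves to names in the task's working directory.

    # Sketch only: hypothetical archive path; the shapes match the code
    # above and the parse_legacy_hash_path() tests further down this page.
    from mrjob.setup import parse_legacy_hash_path

    path_dict = parse_legacy_hash_path('archive', 'deps.tar.gz#deps')
    # -> {'type': 'archive', 'path': 'deps.tar.gz', 'name': 'deps'}

    setup_entry = ['export PYTHONPATH=', path_dict, ':$PYTHONPATH']
    assert setup_entry == [
        'export PYTHONPATH=',
        {'type': 'archive', 'path': 'deps.tar.gz', 'name': 'deps'},
        ':$PYTHONPATH',
    ]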
Example 2
 def test_must_name(self):
     self.assertEqual(
         parse_legacy_hash_path('file', 'foo#bar', must_name='it'), {
             'type': 'file',
             'path': 'foo',
             'name': 'bar'
         })
     # use basename if no hash
     self.assertEqual(parse_legacy_hash_path('file', 'foo', must_name='it'),
                      {
                          'type': 'file',
                          'path': 'foo',
                          'name': 'foo'
                      })
     # raise error on explicit empty name
     self.assertRaises(ValueError,
                       parse_legacy_hash_path,
                       'file',
                       'foo#',
                       must_name='it')
     # raise error if no basename
     self.assertRaises(ValueError,
                       parse_legacy_hash_path,
                       'file',
                       'foo/',
                       must_name='it')
Example 3
 def test_basic(self):
     self.assertEqual(parse_legacy_hash_path('file', 'foo#bar'), {
         'type': 'file',
         'path': 'foo',
         'name': 'bar'
     })
     self.assertEqual(parse_legacy_hash_path('file', '/dir/foo#bar'), {
         'type': 'file',
         'path': '/dir/foo',
         'name': 'bar'
     })
Example 4
 def test_no_name(self):
     self.assertEqual(parse_legacy_hash_path('file', 'foo'), {
         'type': 'file',
         'path': 'foo',
         'name': None
     })
     self.assertEqual(parse_legacy_hash_path('file', 'foo#'), {
         'type': 'file',
         'path': 'foo',
         'name': None
     })
Example 5
 def test_trailing_slash_in_name(self):
     self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo.tar.gz#bar/")
     self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo.tar.gz#/")
     # trailing slash is allowed for archives because that's the new
     # way of indicating archives
     self.assertEqual(
         parse_legacy_hash_path("archive", "foo.tar.gz#bar/"),
         {"type": "archive", "path": "foo.tar.gz", "name": "bar"},
     )
     self.assertEqual(
         parse_legacy_hash_path("archive", "foo.tar.gz#/"), {"type": "archive", "path": "foo.tar.gz", "name": None}
     )
Example 6
 def test_must_name(self):
     self.assertEqual(
         parse_legacy_hash_path("file", "foo#bar", must_name="it"), {"type": "file", "path": "foo", "name": "bar"}
     )
     # use basename if no hash
     self.assertEqual(
         parse_legacy_hash_path("file", "foo", must_name="it"), {"type": "file", "path": "foo", "name": "foo"}
     )
     # raise error on explicit empty name
     self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo#", must_name="it")
     # raise error if no basename
     self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo/", must_name="it")
Example 7
 def test_trailing_slash_in_name(self):
     self.assertRaises(
         ValueError,
         parse_legacy_hash_path, 'file', 'foo.tar.gz#bar/')
     self.assertRaises(
         ValueError,
         parse_legacy_hash_path, 'file', 'foo.tar.gz#/')
     # trailing slash is allowed for archives because that's the new
     # way of indicating archives
     self.assertEqual(
         parse_legacy_hash_path('archive', 'foo.tar.gz#bar/'),
         {'type': 'archive', 'path': 'foo.tar.gz', 'name': 'bar'})
     self.assertEqual(
         parse_legacy_hash_path('archive', 'foo.tar.gz#/'),
         {'type': 'archive', 'path': 'foo.tar.gz', 'name': None})
Example 8
 def test_must_name(self):
     self.assertEqual(
         parse_legacy_hash_path('file', 'foo#bar', must_name='it'),
         {'type': 'file', 'path': 'foo', 'name': 'bar'})
     # use basename if no hash
     self.assertEqual(
         parse_legacy_hash_path('file', 'foo', must_name='it'),
         {'type': 'file', 'path': 'foo', 'name': 'foo'})
     # raise error on explicit empty name
     self.assertRaises(ValueError,
                       parse_legacy_hash_path, 'file', 'foo#',
                       must_name='it')
     # raise error if no basename
     self.assertRaises(ValueError,
                       parse_legacy_hash_path, 'file', 'foo/',
                       must_name='it')
Example 9
    def _parse_setup(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.

        If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
        true, create mrjob.tar.gz (if it doesn't exist already) and
        prepend a setup command that adds it to PYTHONPATH.

        Also patch in the deprecated
        options *python_archives*, *setup_cmds*, and *setup_scripts*
        as setup commands.
        """
        setup = []

        # python_archives
        for path in self._opts["python_archives"]:
            path_dict = parse_legacy_hash_path("archive", path)
            setup.append(["export PYTHONPATH=", path_dict, ":$PYTHONPATH"])

        # setup
        for cmd in self._opts["setup"]:
            setup.append(parse_setup_cmd(cmd))

        # setup_cmds
        if self._opts["setup_cmds"]:
            log.warning(
                "setup_cmds is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead."
            )

        for cmd in self._opts["setup_cmds"]:
            if not isinstance(cmd, string_types):
                cmd = cmd_line(cmd)
            setup.append([cmd])

        # setup_scripts
        if self._opts["setup_scripts"]:
            log.warning(
                "setup_scripts is deprecated since v0.4.2 and will be removed"
                " in v0.6.0. Consider using setup instead."
            )

        for path in self._opts["setup_scripts"]:
            path_dict = parse_legacy_hash_path("file", path)
            setup.append([path_dict])

        return setup
Example 10
    def _parse_setup_and_py_files(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`, and patch in *py_files*.
        """
        setup = []

        # py_files
        for path in self._opts['py_files']:
            # Spark (at least v1.3.1) doesn't work with # and --py-files,
            # see #1375
            if '#' in path:
                raise ValueError("py_files cannot contain '#'")
            path_dict = parse_legacy_hash_path('file', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:
            setup.append(parse_setup_cmd(cmd))

        return setup
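
A quick hedged illustration (the paths are hypothetical): a plain py_files entry becomes a PYTHONPATH export whose path dict has name set to None, while hash-name syntax is rejected before parsing because Spark's --py-files can't handle it.

    # Sketch only: hypothetical py_files entries.
    from mrjob.setup import parse_legacy_hash_path

    entry = ['export PYTHONPATH=',
             parse_legacy_hash_path('file', 'deps.zip'),
             ':$PYTHONPATH']
    assert entry[1] == {'type': 'file', 'path': 'deps.zip', 'name': None}

    # 'deps.zip#deps' would instead hit the ValueError branch above,
    # since '#' is not allowed in py_files.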
Example 11
    def _non_option_kwargs(self):
        """Keyword arguments to runner constructor that can't be set
        in mrjob.conf.

        These should match the (named) arguments to
        :py:meth:`~mrjob.runner.MRJobRunner.__init__`.
        """
        # build extra_args and file_upload_args
        #
        # TODO v0.6.0: deprecate file_upload_args, represent paths to upload
        # as dictionaries in extra_args
        raw_args = _parse_raw_args(self.arg_parser, self._cl_args)

        extra_args = []

        for dest, option_string, args in raw_args:
            if dest in self._file_arg_dests:
                extra_args.append(option_string)
                extra_args.append(parse_legacy_hash_path('file', args[0]))
            elif dest in self._passthru_arg_dests:
                # special case for --hadoop-arg=-verbose etc.
                if (option_string and len(args) == 1
                        and args[0].startswith('-')):
                    extra_args.append('%s=%s' % (option_string, args[0]))
                else:
                    if option_string:
                        extra_args.append(option_string)
                    extra_args.extend(args)

        return dict(
            conf_paths=self.options.conf_paths,
            extra_args=extra_args,
            hadoop_input_format=self.hadoop_input_format(),
            hadoop_output_format=self.hadoop_output_format(),
            input_paths=self.options.args,
            mr_job_script=self._script_path,
            output_dir=self.options.output_dir,
            partitioner=self.partitioner(),
            stdin=self.stdin,
            step_output_dir=self.options.step_output_dir,
        )
Example 12
    def _non_option_kwargs(self):
        """Keyword arguments to runner constructor that can't be set
        in mrjob.conf.

        These should match the (named) arguments to
        :py:meth:`~mrjob.runner.MRJobRunner.__init__`.
        """
        # build extra_args
        raw_args = _parse_raw_args(self.arg_parser, self._cl_args)

        extra_args = []

        for dest, option_string, args in raw_args:
            if dest in self._file_arg_dests:
                extra_args.append(option_string)
                extra_args.append(parse_legacy_hash_path('file', args[0]))
            elif dest in self._passthru_arg_dests:
                # special case for --hadoop-arg=-verbose etc.
                if (option_string and len(args) == 1 and
                        args[0].startswith('-')):
                    extra_args.append('%s=%s' % (option_string, args[0]))
                else:
                    if option_string:
                        extra_args.append(option_string)
                    extra_args.extend(args)

        # max_output_files is added by _add_runner_args() but can only
        # be set from the command line, so we add it here (see #2040)
        return dict(
            conf_paths=self.options.conf_paths,
            extra_args=extra_args,
            hadoop_input_format=self.hadoop_input_format(),
            hadoop_output_format=self.hadoop_output_format(),
            input_paths=self.options.args,
            max_output_files=self.options.max_output_files,
            mr_job_script=self._script_path,
            output_dir=self.options.output_dir,
            partitioner=self.partitioner(),
            stdin=self.stdin,
            step_output_dir=self.options.step_output_dir,
        )
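
The passthrough branch above has one subtle case worth spelling out: when a passthrough option takes a single value that itself starts with a dash, the value is re-joined with '=' so the downstream parser can't mistake it for a separate option. A hedged sketch with made-up raw args:

    # Sketch of the special case for things like --hadoop-arg=-verbose.
    extra_args = []
    option_string, args = '--hadoop-arg', ['-verbose']

    if option_string and len(args) == 1 and args[0].startswith('-'):
        # keep '-verbose' glued to its option so it isn't parsed as an option
        extra_args.append('%s=%s' % (option_string, args[0]))
    else:
        if option_string:
            extra_args.append(option_string)
        extra_args.extend(args)

    assert extra_args == ['--hadoop-arg=-verbose']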
Example 13
    def __init__(self,
                 mr_job_script=None,
                 conf_paths=None,
                 extra_args=None,
                 file_upload_args=None,
                 hadoop_input_format=None,
                 hadoop_output_format=None,
                 input_paths=None,
                 output_dir=None,
                 partitioner=None,
                 sort_values=None,
                 stdin=None,
                 step_output_dir=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    :mrjob-opt:`hadoop_streaming_jar`).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see
                                     :mrjob-opt:`hadoop_streaming_jar`).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                            ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
                            sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        """
        self._ran_job = False

        # opts are made from:
        #
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None
                     for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)])

        log.debug('Active configuration:')
        log.debug(
            pprint.pformat({
                opt_key: self._obfuscate_opt(opt_key, opt_value)
                for opt_key, opt_value in self._opts.items()
            }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark.
        # these are a subset of those in self._working_dir_mgr
        self._spark_files = []
        self._spark_archives = []

        self._upload_mgr = None  # define in subclasses that use this

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(label=self._opts['label'],
                                                  owner=self._opts['owner'])

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)
                self._spark_files.append(
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.6.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file',
                                        hash_path,
                                        must_name='upload_files')
            self._working_dir_mgr.add(**uf)
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive',
                                        hash_path,
                                        must_name='upload_archives')
            self._working_dir_mgr.add(**ua)
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir',
                                        hash_path,
                                        must_name='upload_archives')
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add('archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # py_files

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = self._parse_setup_and_py_files()
        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._working_dir_mgr.add(**token)

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False
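
To make the deprecated file_upload_args conversion above concrete, here is a small sketch (argument name and path are hypothetical): each ('--ARGNAME', path) tuple becomes an option string plus a path dict in extra_args, and the file is also queued for Spark uploads.

    # Sketch only: hypothetical file_upload_args value.
    from mrjob.setup import parse_legacy_hash_path

    file_upload_args = [('--db', 'data/users.sqlite')]

    extra_args, spark_files = [], []
    for arg, path in file_upload_args:
        arg_file = parse_legacy_hash_path('file', path)
        extra_args.extend([arg, arg_file])
        spark_files.append((arg_file['name'], arg_file['path']))

    assert extra_args == [
        '--db', {'type': 'file', 'path': 'data/users.sqlite', 'name': None}]
    assert spark_files == [(None, 'data/users.sqlite')]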
Example 14
    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 stdin=None, **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           streaming should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                            ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        """
        self._ran_job = False

        self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
        self._fs = None

        self._working_dir_mgr = WorkingDirManager()

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(
            label=self._opts['label'], owner=self._opts['owner'])

        # we'll create the wrapper script later
        self._setup_wrapper_script_path = None

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        # extra file arguments to our job
        self._file_upload_args = []
        if file_upload_args:
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._file_upload_args.append((arg, arg_file))

        # set up uploading
        for path in self._opts['upload_files']:
            self._working_dir_mgr.add(**parse_legacy_hash_path(
                'file', path, must_name='upload_files'))
        for path in self._opts['upload_archives']:
            self._working_dir_mgr.add(**parse_legacy_hash_path(
                'archive', path, must_name='upload_archives'))

        # python_archives, setup, setup_cmds, and setup_scripts
        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = self._parse_setup()
        for cmd in self._setup:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):
                    self._working_dir_mgr.add(**maybe_path_dict)

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a tarball of the mrjob library is stored locally
        self._mrjob_tar_gz_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # if this is True, we have to pipe input into the sort command
        # rather than feed it multiple files
        self._sort_is_windows_sort = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False
Example 15
 def _add_python_archive(self, path):
     python_archive = parse_legacy_hash_path('archive', path)
     self._working_dir_mgr.add(**python_archive)
     self._python_archives.append(python_archive)
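
For example (hypothetical path), calling this with 'mylib.tar.gz#mylib' registers an archive dict whose name comes from the part after the hash:

    # Sketch only: hypothetical python archive path.
    from mrjob.setup import parse_legacy_hash_path

    python_archive = parse_legacy_hash_path('archive', 'mylib.tar.gz#mylib')
    assert python_archive == {
        'type': 'archive', 'path': 'mylib.tar.gz', 'name': 'mylib'}
    # _working_dir_mgr.add(**python_archive) then receives
    # type='archive', path='mylib.tar.gz', name='mylib'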
Example 16
    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 sort_values=None, stdin=None, step_output_dir=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    :mrjob-opt:`hadoop_streaming_jar`).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see
                                     :mrjob-opt:`hadoop_streaming_jar`).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                            ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
                            sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        """
        self._ran_job = False

        # opts are made from:
        #
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)]
        )

        log.debug('Active configuration:')
        log.debug(pprint.pformat({
            opt_key: self._obfuscate_opt(opt_key, opt_value)
            for opt_key, opt_value in self._opts.items()
        }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark.
        # these are a subset of those in self._working_dir_mgr
        self._spark_files = []
        self._spark_archives = []

        self._upload_mgr = None  # define in subclasses that use this

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(
            label=self._opts['label'], owner=self._opts['owner'])

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)
                self._spark_files.append(
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.6.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file', hash_path,
                                        must_name='upload_files')
            self._working_dir_mgr.add(**uf)
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive', hash_path,
                                        must_name='upload_archives')
            self._working_dir_mgr.add(**ua)
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir', hash_path,
                                        must_name='upload_archives')
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add(
                'archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # py_files

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = self._parse_setup_and_py_files()
        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._working_dir_mgr.add(**token)

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False
Example 17
 def test_basic(self):
     self.assertEqual(parse_legacy_hash_path('file', 'foo#bar'),
                      {'type': 'file', 'path': 'foo', 'name': 'bar'})
     self.assertEqual(parse_legacy_hash_path('file', '/dir/foo#bar'),
                      {'type': 'file', 'path': '/dir/foo', 'name': 'bar'})
Example 18
 def test_no_name(self):
     self.assertEqual(parse_legacy_hash_path('file', 'foo'),
                      {'type': 'file', 'path': 'foo', 'name': None})
     self.assertEqual(parse_legacy_hash_path('file', 'foo#'),
                      {'type': 'file', 'path': 'foo', 'name': None})
Example 19
 def test_basic(self):
     self.assertEqual(parse_legacy_hash_path("file", "foo#bar"), {"type": "file", "path": "foo", "name": "bar"})
     self.assertEqual(
         parse_legacy_hash_path("file", "/dir/foo#bar"), {"type": "file", "path": "/dir/foo", "name": "bar"}
     )
Example 20
 def test_no_name(self):
     self.assertEqual(parse_legacy_hash_path("file", "foo"), {"type": "file", "path": "foo", "name": None})
     self.assertEqual(parse_legacy_hash_path("file", "foo#"), {"type": "file", "path": "foo", "name": None})