def _parse_setup(self):
    """Parse the *setup* option with
    :py:func:`mrjob.setup.parse_setup_cmd()`.

    If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
    true, create mrjob.tar.gz (if it doesn't exist already) and prepend a
    setup command that adds it to PYTHONPATH.

    Also patch in the deprecated options *python_archives*, *setup_cmds*,
    and *setup_scripts* as setup commands.
    """
    setup = []

    # python_archives
    for path in self._opts['python_archives']:
        path_dict = parse_legacy_hash_path('archive', path)
        setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

    # setup
    for cmd in self._opts['setup']:
        setup.append(parse_setup_cmd(cmd))

    # setup_cmds
    for cmd in self._opts['setup_cmds']:
        if not isinstance(cmd, basestring):
            cmd = cmd_line(cmd)
        setup.append([cmd])

    # setup_scripts
    for path in self._opts['setup_scripts']:
        path_dict = parse_legacy_hash_path('file', path)
        setup.append([path_dict])

    return setup
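# A minimal, hedged sketch of the structure _parse_setup() builds: each setup
# command is a list of shell fragments with path dicts interleaved. Assumes
# mrjob is importable and parse_legacy_hash_path() keeps the signature used
# above; the archive path and name here are invented for illustration.
from mrjob.setup import parse_legacy_hash_path

path_dict = parse_legacy_hash_path('archive', 'mylib.tar.gz#mylib')
# -> {'type': 'archive', 'path': 'mylib.tar.gz', 'name': 'mylib'}

# a python_archives entry becomes a PYTHONPATH export wrapped around that dict
setup_cmd = ['export PYTHONPATH=', path_dict, ':$PYTHONPATH']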
def test_must_name(self):
    self.assertEqual(
        parse_legacy_hash_path('file', 'foo#bar', must_name='it'),
        {'type': 'file', 'path': 'foo', 'name': 'bar'})

    # use basename if no hash
    self.assertEqual(
        parse_legacy_hash_path('file', 'foo', must_name='it'),
        {'type': 'file', 'path': 'foo', 'name': 'foo'})

    # raise error on explicit empty name
    self.assertRaises(
        ValueError,
        parse_legacy_hash_path, 'file', 'foo#', must_name='it')

    # raise error if no basename
    self.assertRaises(
        ValueError,
        parse_legacy_hash_path, 'file', 'foo/', must_name='it')
def test_basic(self):
    self.assertEqual(
        parse_legacy_hash_path('file', 'foo#bar'),
        {'type': 'file', 'path': 'foo', 'name': 'bar'})
    self.assertEqual(
        parse_legacy_hash_path('file', '/dir/foo#bar'),
        {'type': 'file', 'path': '/dir/foo', 'name': 'bar'})
def test_no_name(self):
    self.assertEqual(
        parse_legacy_hash_path('file', 'foo'),
        {'type': 'file', 'path': 'foo', 'name': None})
    self.assertEqual(
        parse_legacy_hash_path('file', 'foo#'),
        {'type': 'file', 'path': 'foo', 'name': None})
def test_trailing_slash_in_name(self):
    self.assertRaises(
        ValueError,
        parse_legacy_hash_path, 'file', 'foo.tar.gz#bar/')
    self.assertRaises(
        ValueError,
        parse_legacy_hash_path, 'file', 'foo.tar.gz#/')

    # trailing slash is allowed for archives because that's the new
    # way of indicating archives
    self.assertEqual(
        parse_legacy_hash_path('archive', 'foo.tar.gz#bar/'),
        {'type': 'archive', 'path': 'foo.tar.gz', 'name': 'bar'})
    self.assertEqual(
        parse_legacy_hash_path('archive', 'foo.tar.gz#/'),
        {'type': 'archive', 'path': 'foo.tar.gz', 'name': None})
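# Quick reference for the legacy path#name syntax exercised by the tests
# above (a hedged summary of the behavior they assert, not new behavior;
# assumes mrjob.setup is importable):
from mrjob.setup import parse_legacy_hash_path

parse_legacy_hash_path('file', 'foo#bar')
# -> {'type': 'file', 'path': 'foo', 'name': 'bar'}

parse_legacy_hash_path('file', 'foo')
# -> {'type': 'file', 'path': 'foo', 'name': None}

# with must_name, a missing name falls back to the path's basename
parse_legacy_hash_path('file', 'foo', must_name='upload_files')
# -> {'type': 'file', 'path': 'foo', 'name': 'foo'}

# a trailing slash on the name is only legal for archives
parse_legacy_hash_path('archive', 'foo.tar.gz#bar/')
# -> {'type': 'archive', 'path': 'foo.tar.gz', 'name': 'bar'}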
def _parse_setup(self):
    """Parse the *setup* option with
    :py:func:`mrjob.setup.parse_setup_cmd()`.

    If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
    true, create mrjob.tar.gz (if it doesn't exist already) and prepend a
    setup command that adds it to PYTHONPATH.

    Also patch in the deprecated options *python_archives*, *setup_cmds*,
    and *setup_scripts* as setup commands.
    """
    setup = []

    # python_archives
    for path in self._opts['python_archives']:
        path_dict = parse_legacy_hash_path('archive', path)
        setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

    # setup
    for cmd in self._opts['setup']:
        setup.append(parse_setup_cmd(cmd))

    # setup_cmds
    if self._opts['setup_cmds']:
        log.warning(
            'setup_cmds is deprecated since v0.4.2 and will be removed'
            ' in v0.6.0. Consider using setup instead.')

    for cmd in self._opts['setup_cmds']:
        if not isinstance(cmd, string_types):
            cmd = cmd_line(cmd)
        setup.append([cmd])

    # setup_scripts
    if self._opts['setup_scripts']:
        log.warning(
            'setup_scripts is deprecated since v0.4.2 and will be removed'
            ' in v0.6.0. Consider using setup instead.')

    for path in self._opts['setup_scripts']:
        path_dict = parse_legacy_hash_path('file', path)
        setup.append([path_dict])

    return setup
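# Hedged sketch of the deprecated setup_cmds translation above. cmd_line() is
# assumed here to come from mrjob.util and to shell-quote-and-join an argv
# list; the package name in the example is invented.
from mrjob.util import cmd_line

cmd = ['sudo', 'apt-get', 'install', '-y', 'python-numpy']
setup_cmd = [cmd_line(cmd)]
# -> ['sudo apt-get install -y python-numpy']
# i.e. each setup_cmds entry becomes a single-token setup command, while
# string entries are passed through unchanged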
def _parse_setup_and_py_files(self):
    """Parse the *setup* option with
    :py:func:`mrjob.setup.parse_setup_cmd()`, and patch in *py_files*.
    """
    setup = []

    # py_files
    for path in self._opts['py_files']:
        # Spark (at least v1.3.1) doesn't work with # and --py-files,
        # see #1375
        if '#' in path:
            raise ValueError("py_files cannot contain '#'")

        path_dict = parse_legacy_hash_path('file', path)
        setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

    # setup
    for cmd in self._opts['setup']:
        setup.append(parse_setup_cmd(cmd))

    return setup
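# Hedged illustration of the py_files handling above (the file name is
# invented): a plain path becomes a PYTHONPATH export around a 'file' path
# dict, while a path containing '#' would make _parse_setup_and_py_files()
# raise ValueError, since Spark's --py-files doesn't understand #name syntax.
from mrjob.setup import parse_legacy_hash_path

path_dict = parse_legacy_hash_path('file', 'deps.zip')
# -> {'type': 'file', 'path': 'deps.zip', 'name': None}
setup_cmd = ['export PYTHONPATH=', path_dict, ':$PYTHONPATH']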
def _non_option_kwargs(self):
    """Keyword arguments to runner constructor that can't be set
    in mrjob.conf.

    These should match the (named) arguments to
    :py:meth:`~mrjob.runner.MRJobRunner.__init__`.
    """
    # build extra_args and file_upload_args
    #
    # TODO v0.6.0: deprecate file_upload_args, represent paths to upload
    # as dictionaries in extra_args
    raw_args = _parse_raw_args(self.arg_parser, self._cl_args)

    extra_args = []

    for dest, option_string, args in raw_args:
        if dest in self._file_arg_dests:
            extra_args.append(option_string)
            extra_args.append(parse_legacy_hash_path('file', args[0]))
        elif dest in self._passthru_arg_dests:
            # special case for --hadoop-arg=-verbose etc.
            if (option_string and len(args) == 1 and
                    args[0].startswith('-')):
                extra_args.append('%s=%s' % (option_string, args[0]))
            else:
                if option_string:
                    extra_args.append(option_string)
                extra_args.extend(args)

    return dict(
        conf_paths=self.options.conf_paths,
        extra_args=extra_args,
        hadoop_input_format=self.hadoop_input_format(),
        hadoop_output_format=self.hadoop_output_format(),
        input_paths=self.options.args,
        mr_job_script=self._script_path,
        output_dir=self.options.output_dir,
        partitioner=self.partitioner(),
        stdin=self.stdin,
        step_output_dir=self.options.step_output_dir,
    )
def _non_option_kwargs(self):
    """Keyword arguments to runner constructor that can't be set
    in mrjob.conf.

    These should match the (named) arguments to
    :py:meth:`~mrjob.runner.MRJobRunner.__init__`.
    """
    # build extra_args
    raw_args = _parse_raw_args(self.arg_parser, self._cl_args)

    extra_args = []

    for dest, option_string, args in raw_args:
        if dest in self._file_arg_dests:
            extra_args.append(option_string)
            extra_args.append(parse_legacy_hash_path('file', args[0]))
        elif dest in self._passthru_arg_dests:
            # special case for --hadoop-arg=-verbose etc.
            if (option_string and len(args) == 1 and
                    args[0].startswith('-')):
                extra_args.append('%s=%s' % (option_string, args[0]))
            else:
                if option_string:
                    extra_args.append(option_string)
                extra_args.extend(args)

    # max_output_files is added by _add_runner_args() but can only
    # be set from the command line, so we add it here (see #2040)
    return dict(
        conf_paths=self.options.conf_paths,
        extra_args=extra_args,
        hadoop_input_format=self.hadoop_input_format(),
        hadoop_output_format=self.hadoop_output_format(),
        input_paths=self.options.args,
        max_output_files=self.options.max_output_files,
        mr_job_script=self._script_path,
        output_dir=self.options.output_dir,
        partitioner=self.partitioner(),
        stdin=self.stdin,
        step_output_dir=self.options.step_output_dir,
    )
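# Hedged sketch of the extra_args translation performed by both
# _non_option_kwargs() variants above. The (dest, option_string, args) tuples
# and option names here are invented for illustration; real values come from
# _parse_raw_args().
raw_args = [
    ('database', '--database', ['my.sqlite3']),     # in _file_arg_dests
    ('hadoop_args', '--hadoop-arg', ['-verbose']),  # passthru, value starts with '-'
    ('max_lines', '--max-lines', ['100']),          # ordinary passthru
]
# would produce roughly:
# extra_args = [
#     '--database', {'type': 'file', 'path': 'my.sqlite3', 'name': None},
#     '--hadoop-arg=-verbose',   # '=' keeps the leading '-' from being read
#                                # as a separate option
#     '--max-lines', '100',
# ]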
def __init__(self, mr_job_script=None, conf_paths=None,
             extra_args=None, file_upload_args=None,
             hadoop_input_format=None, hadoop_output_format=None,
             input_paths=None, output_dir=None, partitioner=None,
             sort_values=None, stdin=None, step_output_dir=None,
             **opts):
    """All runners take the following keyword arguments:

    :type mr_job_script: str
    :param mr_job_script: the path of the ``.py`` file containing the
                          :py:class:`~mrjob.job.MRJob`. If this is None,
                          you won't actually be able to :py:meth:`run` the
                          job, but other utilities (e.g. :py:meth:`ls`)
                          will work.
    :type conf_paths: None or list
    :param conf_paths: List of config files to combine and use, or None to
                       search for mrjob.conf in the default locations.
    :type extra_args: list of str
    :param extra_args: a list of extra cmd-line arguments to pass to the
                       mr_job script. This is a hook to allow jobs to take
                       additional arguments.
    :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                             The file at the given path will be uploaded
                             to the local directory of the mr_job script
                             when it runs, and then passed into the script
                             with ``--ARGNAME``. Useful for passing in
                             SQLite DBs and other configuration files to
                             your job.
    :type hadoop_input_format: str
    :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                class. Passed to Hadoop along with your
                                first step with the ``-inputformat``
                                option. Note that if you write your own
                                class, you'll need to include it in your
                                own custom streaming jar (see
                                :mrjob-opt:`hadoop_streaming_jar`).
    :type hadoop_output_format: str
    :param hadoop_output_format: name of an optional Hadoop
                                 ``OutputFormat`` class. Passed to Hadoop
                                 along with your first step with the
                                 ``-outputformat`` option. Note that if
                                 you write your own class, you'll need to
                                 include it in your own custom streaming
                                 jar (see
                                 :mrjob-opt:`hadoop_streaming_jar`).
    :type input_paths: list of str
    :param input_paths: Input files for your job. Supports globs and
                        recursively walks directories (e.g.
                        ``['data/common/', 'data/training/*.gz']``). If
                        this is left blank, we'll read from stdin.
    :type output_dir: str
    :param output_dir: An empty/non-existent directory where Hadoop
                       should put the final output from the job.
                       If you don't specify an output directory, we'll
                       output into a subdirectory of this job's temporary
                       directory. You can control this from the command
                       line with ``--output-dir``. This option cannot be
                       set from configuration files. If used with the
                       hadoop runner, this path does not need to be fully
                       qualified with ``hdfs://`` URIs because it's
                       understood that it has to be on HDFS.
    :type partitioner: str
    :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                        ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                        Hadoop streaming will use this to determine how
                        mapper output should be sorted and distributed to
                        reducers.
    :type sort_values: bool
    :param sort_values: if true, set partitioners and jobconf variables so
                        that reducers receive the values associated with
                        any key in sorted order (sorted by their *encoded*
                        value). Also known as secondary sort.
    :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                  as stdin. This is a hook for testing; if you set
                  ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                  get passed through to the runner. If for some reason
                  your lines are missing newlines, we'll add them; this
                  makes it easier to write automated tests.
    :type step_output_dir: str
    :param step_output_dir: An empty/non-existent directory where Hadoop
                            should put output from all steps other than
                            the last one (this only matters for multi-step
                            jobs). Currently ignored by local runners.
    """
    self._ran_job = False

    # opts are made from:
    #
    # empty defaults (everything set to None)
    # runner-specific defaults
    # opts from config file(s)
    # opts from command line
    self._opts = self._combine_confs(
        [(None, {key: None for key in self.OPT_NAMES})] +
        [(None, self._default_opts())] +
        load_opts_from_mrjob_confs(self.alias, conf_paths) +
        [('the command line', opts)])

    log.debug('Active configuration:')
    log.debug(pprint.pformat({
        opt_key: self._obfuscate_opt(opt_key, opt_value)
        for opt_key, opt_value in self._opts.items()
    }))

    self._fs = None

    # a local tmp directory that will be cleaned up when we're done
    # access/make this using self._get_local_tmp_dir()
    self._local_tmp_dir = None

    self._working_dir_mgr = WorkingDirManager()

    # mapping from dir to path for corresponding archive. we pick
    # paths during init(), but don't actually create the archives
    # until self._create_dir_archives() is called
    self._dir_to_archive_path = {}
    # dir archive names (the filename minus ".tar.gz") already taken
    self._dir_archive_names_taken = set()
    # set of dir_archives that have actually been created
    self._dir_archives_created = set()

    # track (name, path) of files and archives to upload to spark.
    # these are a subset of those in self._working_dir_mgr
    self._spark_files = []
    self._spark_archives = []

    self._upload_mgr = None  # define in subclasses that use this

    self._script_path = mr_job_script
    if self._script_path:
        self._working_dir_mgr.add('file', self._script_path)

    # give this job a unique name
    self._job_key = self._make_unique_job_key(
        label=self._opts['label'], owner=self._opts['owner'])

    # extra args to our job
    self._extra_args = list(extra_args) if extra_args else []
    for extra_arg in self._extra_args:
        if isinstance(extra_arg, dict):
            if extra_arg.get('type') != 'file':
                raise NotImplementedError
            self._working_dir_mgr.add(**extra_arg)
            self._spark_files.append(
                (extra_arg['name'], extra_arg['path']))

    # extra file arguments to our job
    if file_upload_args:
        log.warning('file_upload_args is deprecated and will be removed'
                    ' in v0.6.0. Pass dicts to extra_args instead.')
        for arg, path in file_upload_args:
            arg_file = parse_legacy_hash_path('file', path)
            self._working_dir_mgr.add(**arg_file)
            self._extra_args.extend([arg, arg_file])
            self._spark_files.append((arg_file['name'], arg_file['path']))

    # set up uploading
    for hash_path in self._opts['upload_files']:
        uf = parse_legacy_hash_path('file', hash_path,
                                    must_name='upload_files')
        self._working_dir_mgr.add(**uf)
        self._spark_files.append((uf['name'], uf['path']))

    for hash_path in self._opts['upload_archives']:
        ua = parse_legacy_hash_path('archive', hash_path,
                                    must_name='upload_archives')
        self._working_dir_mgr.add(**ua)
        self._spark_archives.append((ua['name'], ua['path']))

    for hash_path in self._opts['upload_dirs']:
        # pick name based on directory path
        ud = parse_legacy_hash_path('dir', hash_path,
                                    must_name='upload_archives')
        # but feed working_dir_mgr the archive's path
        archive_path = self._dir_archive_path(ud['path'])
        self._working_dir_mgr.add(
            'archive', archive_path, name=ud['name'])
        self._spark_archives.append((ud['name'], archive_path))

    # py_files

    # self._setup is a list of shell commands with path dicts
    # interleaved; see mrjob.setup.parse_setup_cmd() for details
    self._setup = self._parse_setup_and_py_files()
    for cmd in self._setup:
        for token in cmd:
            if isinstance(token, dict):
                # convert dir archives tokens to archives
                if token['type'] == 'dir':
                    # feed the archive's path to self._working_dir_mgr
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                self._working_dir_mgr.add(**token)

    # Where to read input from (log files, etc.)
    self._input_paths = input_paths or ['-']  # by default read from stdin
    if PY2:
        self._stdin = stdin or sys.stdin
    else:
        self._stdin = stdin or sys.stdin.buffer
    self._stdin_path = None  # temp file containing dump from stdin

    # where a zip file of the mrjob library is stored locally
    self._mrjob_zip_path = None

    # store output_dir
    self._output_dir = output_dir

    # store partitioner
    self._partitioner = partitioner

    # store sort_values
    self._sort_values = sort_values

    # store step_output_dir
    self._step_output_dir = step_output_dir

    # store hadoop input and output formats
    self._hadoop_input_format = hadoop_input_format
    self._hadoop_output_format = hadoop_output_format

    # A cache for self._get_steps(); also useful as a test hook
    self._steps = None

    # this variable marks whether a cleanup has happened and this runner's
    # output stream is no longer available.
    self._closed = False
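# Hedged sketch of how the deprecated file_upload_args keyword is folded into
# extra_args by the constructor above (the path and option name are invented):
from mrjob.setup import parse_legacy_hash_path

file_upload_args = [('--database', 'my.sqlite3')]

extra_args = []
for arg, path in file_upload_args:
    arg_file = parse_legacy_hash_path('file', path)
    extra_args.extend([arg, arg_file])
# extra_args is now
# ['--database', {'type': 'file', 'path': 'my.sqlite3', 'name': None}]
# which is the form newer code passes directly via extra_args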
def __init__(self, mr_job_script=None, conf_paths=None,
             extra_args=None, file_upload_args=None,
             hadoop_input_format=None, hadoop_output_format=None,
             input_paths=None, output_dir=None, partitioner=None,
             stdin=None, **opts):
    """All runners take the following keyword arguments:

    :type mr_job_script: str
    :param mr_job_script: the path of the ``.py`` file containing the
                          :py:class:`~mrjob.job.MRJob`. If this is None,
                          you won't actually be able to :py:meth:`run` the
                          job, but other utilities (e.g. :py:meth:`ls`)
                          will work.
    :type conf_paths: None or list
    :param conf_paths: List of config files to combine and use, or None to
                       search for mrjob.conf in the default locations.
    :type extra_args: list of str
    :param extra_args: a list of extra cmd-line arguments to pass to the
                       mr_job script. This is a hook to allow jobs to take
                       additional arguments.
    :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                             The file at the given path will be uploaded
                             to the local directory of the mr_job script
                             when it runs, and then passed into the script
                             with ``--ARGNAME``. Useful for passing in
                             SQLite DBs and other configuration files to
                             your job.
    :type hadoop_input_format: str
    :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                class. Passed to Hadoop along with your
                                first step with the ``-inputformat``
                                option. Note that if you write your own
                                class, you'll need to include it in your
                                own custom streaming jar (see
                                *hadoop_streaming_jar*).
    :type hadoop_output_format: str
    :param hadoop_output_format: name of an optional Hadoop
                                 ``OutputFormat`` class. Passed to Hadoop
                                 along with your first step with the
                                 ``-outputformat`` option. Note that if
                                 you write your own class, you'll need to
                                 include it in your own custom streaming
                                 jar (see *hadoop_streaming_jar*).
    :type input_paths: list of str
    :param input_paths: Input files for your job. Supports globs and
                        recursively walks directories (e.g.
                        ``['data/common/', 'data/training/*.gz']``). If
                        this is left blank, we'll read from stdin.
    :type output_dir: str
    :param output_dir: An empty/non-existent directory where Hadoop
                       streaming should put the final output from the
                       job. If you don't specify an output directory,
                       we'll output into a subdirectory of this job's
                       temporary directory. You can control this from the
                       command line with ``--output-dir``. This option
                       cannot be set from configuration files. If used
                       with the hadoop runner, this path does not need to
                       be fully qualified with ``hdfs://`` URIs because
                       it's understood that it has to be on HDFS.
    :type partitioner: str
    :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                        ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                        Hadoop streaming will use this to determine how
                        mapper output should be sorted and distributed to
                        reducers.
    :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                  as stdin. This is a hook for testing; if you set
                  ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                  get passed through to the runner. If for some reason
                  your lines are missing newlines, we'll add them; this
                  makes it easier to write automated tests.
    """
    self._ran_job = False

    self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
    self._fs = None

    self._working_dir_mgr = WorkingDirManager()

    self._script_path = mr_job_script
    if self._script_path:
        self._working_dir_mgr.add('file', self._script_path)

    # give this job a unique name
    self._job_key = self._make_unique_job_key(
        label=self._opts['label'], owner=self._opts['owner'])

    # we'll create the wrapper script later
    self._setup_wrapper_script_path = None

    # extra args to our job
    self._extra_args = list(extra_args) if extra_args else []

    # extra file arguments to our job
    self._file_upload_args = []
    if file_upload_args:
        for arg, path in file_upload_args:
            arg_file = parse_legacy_hash_path('file', path)
            self._working_dir_mgr.add(**arg_file)
            self._file_upload_args.append((arg, arg_file))

    # set up uploading
    for path in self._opts['upload_files']:
        self._working_dir_mgr.add(**parse_legacy_hash_path(
            'file', path, must_name='upload_files'))
    for path in self._opts['upload_archives']:
        self._working_dir_mgr.add(**parse_legacy_hash_path(
            'archive', path, must_name='upload_archives'))

    # python_archives, setup, setup_cmds, and setup_scripts
    # self._setup is a list of shell commands with path dicts
    # interleaved; see mrjob.setup.parse_setup_cmd() for details
    self._setup = self._parse_setup()
    for cmd in self._setup:
        for maybe_path_dict in cmd:
            if isinstance(maybe_path_dict, dict):
                self._working_dir_mgr.add(**maybe_path_dict)

    # Where to read input from (log files, etc.)
    self._input_paths = input_paths or ['-']  # by default read from stdin
    if PY2:
        self._stdin = stdin or sys.stdin
    else:
        self._stdin = stdin or sys.stdin.buffer
    self._stdin_path = None  # temp file containing dump from stdin

    # where a tarball of the mrjob library is stored locally
    self._mrjob_tar_gz_path = None

    # store output_dir
    self._output_dir = output_dir

    # store partitioner
    self._partitioner = partitioner

    # store hadoop input and output formats
    self._hadoop_input_format = hadoop_input_format
    self._hadoop_output_format = hadoop_output_format

    # a local tmp directory that will be cleaned up when we're done
    # access/make this using self._get_local_tmp_dir()
    self._local_tmp_dir = None

    # A cache for self._get_steps(); also useful as a test hook
    self._steps = None

    # if this is True, we have to pipe input into the sort command
    # rather than feed it multiple files
    self._sort_is_windows_sort = None

    # this variable marks whether a cleanup has happened and this runner's
    # output stream is no longer available.
    self._closed = False
def _add_python_archive(self, path):
    python_archive = parse_legacy_hash_path('archive', path)
    self._working_dir_mgr.add(**python_archive)
    self._python_archives.append(python_archive)