def test_allow_hidden_files(self):
    wd = WorkingDirManager()
    wd.add('archive', '_foo.tar.gz')
    wd.add('file', '.bazrc')
    self.assertEqual(wd.name('archive', '_foo.tar.gz'), '_foo.tar.gz')
    self.assertEqual(wd.name('file', '.bazrc'), '.bazrc')
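# Hedged sketch (not from the original test suite): an extra test-style
# example illustrating the add()/name() contract the surrounding tests rely
# on. It assumes auto-picked names default to the file's basename, and that
# explicitly named entries must be looked up with their name (as
# test_cant_auto_name_unless_added_as_auto elsewhere in these tests shows).
# Paths are hypothetical.
def test_explicit_and_auto_names_sketch(self):
    wd = WorkingDirManager()
    wd.add('file', '/home/me/db.sqlite')                     # auto-named
    wd.add('archive', '/home/me/code.tar.gz', name='code')   # explicit name
    self.assertEqual(wd.name('file', '/home/me/db.sqlite'), 'db.sqlite')
    self.assertEqual(
        wd.name('archive', '/home/me/code.tar.gz', 'code'), 'code')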
class MRJobRunner(object): """Abstract base class for all runners""" #: alias for this runner; used for picking section of #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``, #: or ``'hadoop'`` alias = None # if this is true, when bootstrap_mrjob is true, add it through the # setup script BOOTSTRAP_MRJOB_IN_SETUP = True OPTION_STORE_CLASS = RunnerOptionStore ### methods to call from your batch script ### def __init__(self, mr_job_script=None, conf_paths=None, extra_args=None, file_upload_args=None, hadoop_input_format=None, hadoop_output_format=None, input_paths=None, output_dir=None, partitioner=None, stdin=None, **opts): """All runners take the following keyword arguments: :type mr_job_script: str :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work. :type conf_paths: None or list :param conf_paths: List of config files to combine and use, or None to search for mrjob.conf in the default locations. :type extra_args: list of str :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. This is a hook to allow jobs to take additional arguments. :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job. :type hadoop_input_format: str :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see *hadoop_streaming_jar*). :type hadoop_output_format: str :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see *hadoop_streaming_jar*). :type input_paths: list of str :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin :type output_dir: str :param output_dir: An empty/non-existent directory where Hadoop streaming should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``. This option cannot be set from configuration files. If used with the hadoop runner, this path does not need to be fully qualified with ``hdfs://`` URIs because it's understood that it has to be on HDFS. :type partitioner: str :param partitioner: Optional name of a Hadoop partitoner class, e.g. ``'org.apache.hadoop.mapred.lib.HashPartitioner'``. Hadoop streaming will use this to determine how mapper output should be sorted and distributed to reducers. :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use as stdin. This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests. 
""" self._ran_job = False self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths) self._fs = None self._working_dir_mgr = WorkingDirManager() self._script_path = mr_job_script if self._script_path: self._working_dir_mgr.add('file', self._script_path) # give this job a unique name self._job_key = self._make_unique_job_key( label=self._opts['label'], owner=self._opts['owner']) # we'll create the wrapper script later self._setup_wrapper_script_path = None # extra args to our job self._extra_args = list(extra_args) if extra_args else [] # extra file arguments to our job self._file_upload_args = [] if file_upload_args: for arg, path in file_upload_args: arg_file = parse_legacy_hash_path('file', path) self._working_dir_mgr.add(**arg_file) self._file_upload_args.append((arg, arg_file)) # set up uploading for path in self._opts['upload_files']: self._working_dir_mgr.add(**parse_legacy_hash_path( 'file', path, must_name='upload_files')) for path in self._opts['upload_archives']: self._working_dir_mgr.add(**parse_legacy_hash_path( 'archive', path, must_name='upload_archives')) # python_archives, setup, setup_cmds, and setup_scripts # self._setup is a list of shell commands with path dicts # interleaved; see mrjob.setup.parse_setup_cmds() for details self._setup = self._parse_setup() for cmd in self._setup: for maybe_path_dict in cmd: if isinstance(maybe_path_dict, dict): self._working_dir_mgr.add(**maybe_path_dict) # Where to read input from (log files, etc.) self._input_paths = input_paths or ['-'] # by default read from stdin if PY2: self._stdin = stdin or sys.stdin else: self._stdin = stdin or sys.stdin.buffer self._stdin_path = None # temp file containing dump from stdin # where a tarball of the mrjob library is stored locally self._mrjob_tar_gz_path = None # store output_dir self._output_dir = output_dir # store partitioner self._partitioner = partitioner # store hadoop input and output formats self._hadoop_input_format = hadoop_input_format self._hadoop_output_format = hadoop_output_format # a local tmp directory that will be cleaned up when we're done # access/make this using self._get_local_tmp_dir() self._local_tmp_dir = None # A cache for self._get_steps(); also useful as a test hook self._steps = None # if this is True, we have to pipe input into the sort command # rather than feed it multiple files self._sort_is_windows_sort = None # this variable marks whether a cleanup has happened and this runner's # output stream is no longer available. self._closed = False ### Filesystem object ### @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for the local filesystem. 
Methods on :py:class:`~mrjob.fs.base.Filesystem` objects will be forwarded to :py:class:`~mrjob.runner.MRJobRunner` until mrjob 0.6.0, but **this behavior is deprecated.** """ if self._fs is None: # wrap LocalFilesystem in CompositeFilesystem to get IOError # on URIs (see #1185) self._fs = CompositeFilesystem(LocalFilesystem()) return self._fs def __getattr__(self, name): # For backward compatibility, forward filesystem methods try: value = getattr(self.fs, name) except AttributeError: raise AttributeError(name) # friendly deprecation warning is_func = ismethod(value) or isfunction(value) log.warning( 'deprecated: %s %s.fs.%s%s directly' ' (%s.%s is going away in v0.6.0)' % ( 'call' if is_func else 'access', self.__class__.__name__, name, '()' if is_func else '', self.__class__.__name__, name)) return value ### Running the job and parsing output ### def run(self): """Run the job, and block until it finishes. Raise an exception if there are any problems. """ if not self._script_path: raise AssertionError("No script to run!") if self._ran_job: raise AssertionError("Job already ran!") self._run() self._ran_job = True def stream_output(self): """Stream raw lines from the job's output. You can parse these using the read() method of the appropriate HadoopStreamingProtocol class.""" output_dir = self.get_output_dir() if output_dir is None: raise AssertionError('Run the job before streaming output') if self._closed is True: log.warning( 'WARNING! Trying to stream output from a closed runner, output' ' will probably be empty.') log.info('Streaming final output from %s' % output_dir) def split_path(path): while True: base, name = os.path.split(path) # no more elements if not name: break yield name path = base for filename in self.fs.ls(output_dir): subpath = filename[len(output_dir):] if not any(name.startswith('_') for name in split_path(subpath)): for line in self.fs._cat_file(filename): yield line def _cleanup_mode(self, mode=None): """Actual cleanup action to take based on various options""" if self._script_path and not self._ran_job: return mode or self._opts['cleanup_on_failure'] else: return mode or self._opts['cleanup'] def _cleanup_local_tmp(self): """Cleanup any files/directories on the local machine we created while running this job. Should be safe to run this at any time, or multiple times. This particular function removes any local tmp directories added to the list self._local_tmp_dirs This won't remove output_dir if it's outside of our tmp dir. """ if self._local_tmp_dir: log.info('removing tmp directory %s' % self._local_tmp_dir) try: shutil.rmtree(self._local_tmp_dir) except OSError as e: log.exception(e) self._local_tmp_dir = None def _cleanup_remote_tmp(self): """Cleanup any files/directories on the remote machine (S3) we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # this only happens on EMR def _cleanup_logs(self): """Cleanup any log files that are created as a side-effect of the job. """ pass # this only happens on EMR def _cleanup_job(self): """Stop any jobs that we created that are still running.""" pass # this only happens on EMR def _cleanup_job_flow(self): """Terminate the job flow if there is one.""" pass # this only happens on EMR def cleanup(self, mode=None): """Clean up running jobs, temp files, and logs, subject to the *cleanup* option passed to the constructor. If you create your runner in a :keyword:`with` block, :py:meth:`cleanup` will be called automatically:: with mr_job.make_runner() as runner: ... 
# cleanup() called automatically here :param mode: override *cleanup* passed into the constructor. Should be a list of strings from :py:data:`CLEANUP_CHOICES` """ mode = self._cleanup_mode(mode) def mode_has(*args): return any((choice in mode) for choice in args) if self._script_path and not self._ran_job: if mode_has('JOB_FLOW', 'ALL'): self._cleanup_job_flow() if mode_has('JOB', 'ALL'): self._cleanup_job() if mode_has('ALL', 'TMP', 'LOCAL_TMP'): self._cleanup_local_tmp() if mode_has('ALL', 'TMP', 'REMOTE_TMP'): self._cleanup_remote_tmp() if mode_has('ALL', 'LOGS'): self._cleanup_logs() self._closed = True def counters(self): """Get counters associated with this run in this form:: [{'group name': {'counter1': 1, 'counter2': 2}}, {'group name': ...}] The list contains an entry for every step of the current job, ignoring earlier steps in the same job flow. """ raise NotImplementedError def _print_counters(self, step_nums=None): """Log this run's counters in a user-friendly way. :type step_nums: list of int :param step_nums: Optional list of indexes of steps in ``self.counters()`` to filter on. Prints step nums 1-indexed (e.g. "step 1"), but *step_nums* is 0-indexed (e.g. [0]). """ for step_num, step_counters in enumerate(self.counters()): if step_nums is None or step_num in step_nums: log.info('Counters from step %d:' % (step_num + 1)) if step_counters: for group, group_counters in sorted(step_counters.items()): log.info('\t%s' % group) for counter, amount in sorted(group_counters.items()): log.info('\t\t%s=%d' % (counter, amount)) else: log.info(' (none found)') ### hooks for the with statement ### def __enter__(self): """Don't do anything special at start of with block""" return self def __exit__(self, type, value, traceback): """Call self.cleanup() at end of with block.""" self.cleanup() ### more runner information ### def get_opts(self): """Get options set for this runner, as a dict.""" return copy.deepcopy(self._opts) def get_job_key(self): """Get the unique key for the job run by this runner. This has the format ``label.owner.date.time.microseconds`` """ return self._job_key def get_job_name(self): """Alias for :py:meth:`get_job_key`. Will be removed in v0.6.0. .. deprecated:: 0.5.0 """ log.warn('get_job_name() has been renamed to get_job_key().' ' get_job_name() will be removed in v0.6.0') return self.get_job_key() def get_output_dir(self): """Find the directory containing the job output. If the job hasn't run yet, returns None""" if self._script_path and not self._ran_job: return None return self._output_dir ### other methods you need to implement in your subclass ### def get_hadoop_version(self): """Return the version number of the Hadoop environment as a string if Hadoop is being used or simulated. Return None if not applicable. :py:class:`~mrjob.emr.EMRJobRunner` infers this from the job flow. :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an additional `hadoop_version` option to specify which version it simulates, with a default of 0.20. :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at all. 
""" return None # you'll probably wan't to add your own __init__() and cleanup() as well def _run(self): """Run the job.""" raise NotImplementedError ### internal utilities for implementing MRJobRunners ### def _get_local_tmp_dir(self): """Create a tmp directory on the local filesystem that will be cleaned up by self.cleanup()""" if not self._local_tmp_dir: path = os.path.join(self._opts['local_tmp_dir'], self._job_key) log.info('creating tmp directory %s' % path) if os.path.isdir(path): shutil.rmtree(path) os.makedirs(path) self._local_tmp_dir = path return self._local_tmp_dir def _make_unique_job_key(self, label=None, owner=None): """Come up with a useful unique ID for this job. We use this to choose the output directory, etc. for the job. """ # use the name of the script if one wasn't explicitly # specified if not label: if self._script_path: label = os.path.basename(self._script_path).split('.')[0] else: label = 'no_script' if not owner: owner = 'no_user' now = datetime.datetime.utcnow() return '%s.%s.%s.%06d' % ( label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond) def _get_steps(self): """Call the job script to find out how many steps it has, and whether there are mappers and reducers for each step. Validate its output. Returns output as described in :ref:`steps-format`. Results are cached, so call this as many times as you want. """ if self._steps is None: if not self._script_path: self._steps = [] else: args = (self._executable(True) + ['--steps'] + self._mr_job_extra_args(local=True)) log.debug('> %s' % cmd_line(args)) # add . to PYTHONPATH (in case mrjob isn't actually installed) env = combine_local_envs(os.environ, {'PYTHONPATH': os.path.abspath('.')}) steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env) stdout, stderr = steps_proc.communicate() if steps_proc.returncode != 0: raise Exception( 'error getting step information: \n%s' % stderr) # on Python 3, convert stdout to str so we can json.loads() it if not isinstance(stdout, str): stdout = stdout.decode('utf_8') try: steps = json.loads(stdout) except ValueError: raise ValueError("Bad --steps response: \n%s" % stdout) # verify that this is a proper step description if not steps or not stdout: raise ValueError('step description is empty!') for step in steps: if step['type'] not in STEP_TYPES: raise ValueError( 'unexpected step type %r in steps %r' % ( step['type'], stdout)) self._steps = steps return self._steps def _get_step(self, step_num): """Get a single step (calls :py:meth:`_get_steps`).""" return self._get_steps()[step_num] def _num_steps(self): """Get the number of steps (calls :py:meth:`get_steps`).""" return len(self._get_steps()) def _interpreter(self, steps=False): if steps: return (self._opts['steps_interpreter'] or self._opts['interpreter'] or self._python_bin(steps=True)) else: return (self._opts['interpreter'] or self._python_bin()) def _executable(self, steps=False): if steps: return self._interpreter(steps=True) + [self._script_path] else: return self._interpreter() + [ self._working_dir_mgr.name('file', self._script_path)] def _python_bin(self, steps=False): if steps: return (self._opts['steps_python_bin'] or self._default_python_bin(local=True)) else: return (self._opts['python_bin'] or self._default_python_bin()) def _default_python_bin(self, local=False): """The default python command. If local is true, try to use sys.executable. Otherwise use 'python' or 'python3' as appropriate. This returns a single-item list (because it's a command). 
""" if local and sys.executable: return [sys.executable] elif PY2: return ['python'] else: # e.g. python3 return ['python%d' % sys.version_info[0]] def _script_args_for_step(self, step_num, mrc): assert self._script_path args = self._executable() + [ '--step-num=%d' % step_num, '--%s' % mrc, ] + self._mr_job_extra_args() if self._setup_wrapper_script_path: return (self._opts['sh_bin'] + [self._working_dir_mgr.name( 'file', self._setup_wrapper_script_path)] + args) else: return args def _substep_cmd_line(self, step_num, mrc): step = self._get_step(step_num) if step[mrc]['type'] == 'command': # never wrap custom hadoop streaming commands in bash return step[mrc]['command'], False elif step[mrc]['type'] == 'script': cmd = cmd_line(self._script_args_for_step(step_num, mrc)) # filter input and pipe for great speed, if user asks # but we have to wrap the command in bash if 'pre_filter' in step[mrc]: return '%s | %s' % (step[mrc]['pre_filter'], cmd), True else: return cmd, False else: raise ValueError("Invalid %s step %d: %r" % ( mrc, step_num, step[mrc])) def _render_substep(self, step_num, mrc): step = self._get_step(step_num) if mrc in step: return self._substep_cmd_line(step_num, mrc) else: if mrc == 'mapper': return 'cat', False else: return None, False def _hadoop_streaming_commands(self, step_num): version = self.get_hadoop_version() # Hadoop streaming stuff mapper, bash_wrap_mapper = self._render_substep( step_num, 'mapper') combiner, bash_wrap_combiner = self._render_substep( step_num, 'combiner') reducer, bash_wrap_reducer = self._render_substep( step_num, 'reducer') if (combiner is not None and not supports_combiners_in_hadoop_streaming(version)): # krazy hack to support combiners on hadoop <0.20 bash_wrap_mapper = True mapper = "%s | sort | %s" % (mapper, combiner) # take the combiner away, hadoop will just be confused combiner = None bash_wrap_combiner = False if bash_wrap_mapper: mapper = bash_wrap(mapper) if bash_wrap_combiner: combiner = bash_wrap(combiner) if bash_wrap_reducer: reducer = bash_wrap(reducer) return mapper, combiner, reducer def _mr_job_extra_args(self, local=False): """Return arguments to add to every invocation of MRJob. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ return (self._get_file_upload_args(local=local) + self._get_strict_protocols_args() + self._extra_args) def _get_file_upload_args(self, local=False): """Arguments used to pass through config files, etc from the job runner through to the local directory where the script is run. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ args = [] for arg, path_dict in self._file_upload_args: args.append(arg) if local: args.append(path_dict['path']) else: args.append(self._working_dir_mgr.name(**path_dict)) return args def _get_strict_protocols_args(self): """Arguments used to control protocol behavior in the job. This just adds --no-strict-protocols when strict_protocols is false. """ # These are only in the runner so that we can default them from # mrjob.conf, which will allow us to eventually remove them. # See issue #726. if not self._opts['strict_protocols']: return ['--no-strict-protocols'] else: return [] def _create_setup_wrapper_script( self, dest='setup-wrapper.sh', local=False): """Create the wrapper script, and write it into our local temp directory (by default, to a file named wrapper.sh). 
This will set ``self._setup_wrapper_script_path``, and add it to ``self._working_dir_mgr`` This will do nothing if ``self._setup`` is empty or this method has already been called. If *local* is true, use local line endings (e.g. Windows). Otherwise, use UNIX line endings (see #1071). """ if self._setup_wrapper_script_path: return setup = self._setup if self._bootstrap_mrjob() and self.BOOTSTRAP_MRJOB_IN_SETUP: # patch setup to add mrjob.tar.gz to PYTYHONPATH mrjob_tar_gz = self._create_mrjob_tar_gz() path_dict = {'type': 'archive', 'name': None, 'path': mrjob_tar_gz} self._working_dir_mgr.add(**path_dict) setup = [['export PYTHONPATH=', path_dict, ':$PYTHONPATH']] + setup if not setup: return path = os.path.join(self._get_local_tmp_dir(), dest) log.info('writing wrapper script to %s' % path) contents = self._setup_wrapper_script_content(setup) for line in contents: log.debug('WRAPPER: ' + line.rstrip('\n')) if local: with open(path, 'w') as f: for line in contents: f.write(line) else: with open(path, 'wb') as f: for line in contents: f.write(line.encode('utf-8')) self._setup_wrapper_script_path = path self._working_dir_mgr.add('file', self._setup_wrapper_script_path) def _parse_setup(self): """Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`. If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both true, create mrjob.tar.gz (if it doesn't exist already) and prepend a setup command that adds it to PYTHONPATH. Also patch in the deprecated options *python_archives*, *setup_cmd*, and *setup_script* as setup commands. """ setup = [] # python_archives for path in self._opts['python_archives']: path_dict = parse_legacy_hash_path('archive', path) setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH']) # setup for cmd in self._opts['setup']: setup.append(parse_setup_cmd(cmd)) # setup_cmds if self._opts['setup_cmds']: log.warning( "setup_cmds is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead.") for cmd in self._opts['setup_cmds']: if not isinstance(cmd, string_types): cmd = cmd_line(cmd) setup.append([cmd]) # setup_scripts if self._opts['setup_scripts']: log.warning( "setup_scripts is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead.") for path in self._opts['setup_scripts']: path_dict = parse_legacy_hash_path('file', path) setup.append([path_dict]) return setup def _setup_wrapper_script_content(self, setup, mrjob_tar_gz_name=None): """Return a (Bourne) shell script that runs the setup commands and then executes whatever is passed to it (this will be our mapper/reducer), as a list of strings (one for each line, including newlines). We obtain a file lock so that two copies of the setup commands cannot run simultaneously on the same machine (this helps for running :command:`make` on a shared source code archive, for example). """ out = [] def writeln(line=''): out.append(line + '\n') # we're always going to execute this script as an argument to # sh, so there's no need to add a shebang (e.g. #!/bin/sh) writeln('# store $PWD') writeln('__mrjob_PWD=$PWD') writeln() writeln('# obtain exclusive file lock') # Basically, we're going to tie file descriptor 9 to our lockfile, # use a subprocess to obtain a lock (which we somehow inherit too), # and then release the lock by closing the file descriptor. # File descriptors 10 and higher are used internally by the shell, # so 9 is as out-of-the-way as we can get. 
writeln('exec 9>/tmp/wrapper.lock.%s' % self._job_key) # would use flock(1), but it's not always available writeln("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" % cmd_line(self._python_bin())) writeln() writeln('# setup commands') # group setup commands so we can redirect their input/output (see # below). Don't use parens; this would invoke a subshell, which would # keep us from exporting environment variables to the task. writeln('{') for cmd in setup: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = ' ' # indent, since these commands are in a group for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._working_dir_mgr.name(**token)) else: # it's raw script line += token writeln(line) # redirect setup commands' input/output so they don't interfere # with the task (see Issue #803). writeln('} 0</dev/null 1>&2') writeln() writeln('# release exclusive file lock') writeln('exec 9>&-') writeln() writeln('# run task from the original working directory') writeln('cd $__mrjob_PWD') writeln('"$@"') return out def _bootstrap_mrjob(self): """Should we bootstrap mrjob?""" if self._opts['bootstrap_mrjob'] is None: return self._opts['interpreter'] is None else: return bool(self._opts['bootstrap_mrjob']) def _get_input_paths(self): """Get the paths to input files, dumping STDIN to a local file if need be.""" if '-' in self._input_paths: if self._stdin_path is None: # prompt user, so they don't think the process has stalled log.info('reading from STDIN') stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') log.debug('dumping stdin to local file %s' % stdin_path) with open(stdin_path, 'wb') as stdin_file: for line in self._stdin: # catch missing newlines (often happens with test data) if not line.endswith(b'\n'): line += b'\n' stdin_file.write(line) self._stdin_path = stdin_path return [self._stdin_path if p == '-' else p for p in self._input_paths] def _create_mrjob_tar_gz(self): """Make a tarball of the mrjob library, without .pyc or .pyo files, This will also set ``self._mrjob_tar_gz_path`` and return it. Typically called from :py:meth:`_create_setup_wrapper_script`. It's safe to call this method multiple times (we'll only create the tarball once.) """ if not self._mrjob_tar_gz_path: # find mrjob library import mrjob if not os.path.basename(mrjob.__file__).startswith('__init__.'): raise Exception( "Bad path for mrjob library: %s; can't bootstrap mrjob", mrjob.__file__) mrjob_dir = os.path.dirname(mrjob.__file__) or '.' tar_gz_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.tar.gz') def filter_path(path): filename = os.path.basename(path) return not(file_ext(filename).lower() in ('.pyc', '.pyo') or # filter out emacs backup files filename.endswith('~') or # filter out emacs lock files filename.startswith('.#') or # filter out MacFuse resource forks filename.startswith('._')) log.debug('archiving %s -> %s as %s' % ( mrjob_dir, tar_gz_path, os.path.join('mrjob', ''))) tar_and_gzip( mrjob_dir, tar_gz_path, filter=filter_path, prefix='mrjob') self._mrjob_tar_gz_path = tar_gz_path return self._mrjob_tar_gz_path def _jobconf_for_step(self, step_num): """Get the jobconf dictionary, optionally including step-specific jobconf info. Also translate jobconfs to the current Hadoop version, if necessary. 
""" step = self._get_step(step_num) jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf')) # if user is using the wrong jobconfs, add in the correct ones self._update_jobconf_for_hadoop_version( jobconf, self.get_hadoop_version()) return jobconf def _update_jobconf_for_hadoop_version(self, jobconf, hadoop_version): """If *jobconf* (a dict) contains jobconf variables from the wrong version of Hadoop, add variables for the right one. If *hadoop_version* is empty, do nothing. """ if not hadoop_version: # this happens for sim runner return translations = {} # for warning, below for key, value in sorted(jobconf.items()): new_key = translate_jobconf(key, hadoop_version) if new_key not in jobconf: jobconf[new_key] = value translations[key] = new_key if translations: log.warning( "Detected hadoop configuration property names that" " do not match hadoop version %s:" "\nThey have been translated as follows\n %s", hadoop_version, '\n'.join([ "%s: %s" % (key, new_key) for key, new_key in sorted(translations.items())])) def _hadoop_args_for_step(self, step_num): """Build a list of extra arguments to the hadoop binary. This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*, *hadoop_output_format*, *jobconf*, and *partitioner*. This doesn't handle input, output, mappers, reducers, or uploading files. """ assert 0 <= step_num < self._num_steps() args = [] # hadoop_extra_args args.extend(self._opts['hadoop_extra_args']) # new-style jobconf version = self.get_hadoop_version() # translate the jobconf configuration names to match # the hadoop version jobconf = self._jobconf_for_step(step_num) if uses_generic_jobconf(version): for key, value in sorted(jobconf.items()): if value is not None: args.extend(['-D', '%s=%s' % (key, value)]) # old-style jobconf else: for key, value in sorted(jobconf.items()): if value is not None: args.extend(['-jobconf', '%s=%s' % (key, value)]) # partitioner if self._partitioner: args.extend(['-partitioner', self._partitioner]) # cmdenv for key, value in sorted(self._opts['cmdenv'].items()): args.append('-cmdenv') args.append('%s=%s' % (key, value)) # hadoop_input_format if (step_num == 0 and self._hadoop_input_format): args.extend(['-inputformat', self._hadoop_input_format]) # hadoop_output_format if (step_num == self._num_steps() - 1 and self._hadoop_output_format): args.extend(['-outputformat', self._hadoop_output_format]) return args def _arg_hash_paths(self, type, upload_mgr): """Helper function for the *upload_args methods.""" for name, path in self._working_dir_mgr.name_to_path(type).items(): uri = self._upload_mgr.uri(path) yield '%s#%s' % (uri, name) def _upload_args(self, upload_mgr): args = [] # TODO: does Hadoop have a way of coping with paths that have # commas in their names? file_hash_paths = list(self._arg_hash_paths('file', upload_mgr)) if file_hash_paths: args.append('-files') args.append(','.join(file_hash_paths)) archive_hash_paths = list(self._arg_hash_paths('archive', upload_mgr)) if archive_hash_paths: args.append('-archives') args.append(','.join(archive_hash_paths)) return args def _pre_0_20_upload_args(self, upload_mgr): """-files/-archive args for Hadoop prior to 0.20.203""" args = [] for file_hash in self._arg_hash_paths('file', upload_mgr): args.append('-cacheFile') args.append(file_hash) for archive_hash in self._arg_hash_paths('archive', upload_mgr): args.append('-cacheArchive') args.append(archive_hash) return args def _invoke_sort(self, input_paths, output_path): """Use the local sort command to sort one or more input files. 
Raise an exception if there is a problem. This is is just a wrapper to handle limitations of Windows sort (see Issue #288). :type input_paths: list of str :param input_paths: paths of one or more input files :type output_path: str :param output_path: where to pipe sorted output into """ if not input_paths: raise ValueError('Must specify at least one input path.') # ignore locale when sorting env = os.environ.copy() env['LC_ALL'] = 'C' # Make sure that the tmp dir environment variables are changed if # the default is changed. env['TMP'] = self._opts['local_tmp_dir'] env['TMPDIR'] = self._opts['local_tmp_dir'] env['TEMP'] = self._opts['local_tmp_dir'] log.info('writing to %s' % output_path) err_path = os.path.join(self._get_local_tmp_dir(), 'sort-stderr') # assume we're using UNIX sort unless we know otherwise if (not self._sort_is_windows_sort) or len(input_paths) == 1: with open(output_path, 'wb') as output: with open(err_path, 'wb') as err: args = ['sort'] + list(input_paths) log.info('> %s' % cmd_line(args)) try: check_call(args, stdout=output, stderr=err, env=env) return except CalledProcessError: pass # Looks like we're using Windows sort self._sort_is_windows_sort = True log.info('Piping files into sort for Windows compatibility') with open(output_path, 'wb') as output: with open(err_path, 'wb') as err: args = ['sort'] log.info('> %s' % cmd_line(args)) proc = Popen(args, stdin=PIPE, stdout=output, stderr=err, env=env) # shovel bytes into the sort process for input_path in input_paths: with open(input_path, 'rb') as input: while True: buf = input.read(_BUFFER_SIZE) if not buf: break proc.stdin.write(buf) proc.stdin.close() proc.wait() if proc.returncode == 0: return # looks like there was a problem. log it and raise an error with open(err_path) as err: for line in err: log.error('STDERR: %s' % line.rstrip('\r\n')) raise CalledProcessError(proc.returncode, args)
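# Hedged sketch of how a batch script might drive this era of MRJobRunner
# ("methods to call from your batch script" above): make_runner(), run(),
# stream_output(), and automatic cleanup() at the end of the "with" block.
# MRWordCount and my_jobs.word_count are hypothetical; the argv is
# illustrative only.

from my_jobs.word_count import MRWordCount  # hypothetical job module

mr_job = MRWordCount(args=['-r', 'local', 'input.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        # parse_output_line() decodes a raw output line with the job's
        # output protocol (available in this era of mrjob)
        key, value = mr_job.parse_output_line(line)
        print(key, value)
# cleanup() has been called automatically here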
def test_cant_auto_name_unless_added_as_auto(self):
    wd = WorkingDirManager()
    wd.add('file', 'bar.py', name='qux.py')
    self.assertEqual(wd.name('file', 'bar.py', 'qux.py'), 'qux.py')
    self.assertRaises(ValueError, wd.name, 'file', 'bar.py')
def test_eager_naming(self):
    wd = WorkingDirManager()
    wd.add('file', 'qux.py')  # qux.py by default
    self.assertEqual(wd.name('file', 'qux.py'), 'qux.py')
    # whoops, picked that name too soon!
    self.assertRaises(ValueError, wd.add, 'file', 'bar.py', name='qux.py')
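# Hedged sketch of the legacy "path#name" upload syntax that the runner code
# consumes via parse_legacy_hash_path() (see the upload_files and
# file_upload_args handling in MRJobRunner). The dict shape shown in the
# comments is inferred from how the runner unpacks it
# ({'type': ..., 'path': ..., 'name': ...}); the paths are hypothetical.

from mrjob.setup import WorkingDirManager, parse_legacy_hash_path

wd = WorkingDirManager()

# with '#name': roughly {'type': 'file', 'path': '/data/cities.sqlite',
#                        'name': 'cities.db'}
wd.add(**parse_legacy_hash_path('file', '/data/cities.sqlite#cities.db'))

# without '#name': name is None, so WorkingDirManager picks one later
wd.add(**parse_legacy_hash_path('file', '/data/stop_words.txt'))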
class MRJobRunner(object): """Abstract base class for all runners""" # this class handles the basic runner framework, options and config files, # arguments to mrjobs, and setting up job working dirs and environments. # this will put files from setup scripts, py_files, and bootstrap_mrjob # into the job's working dir, but won't actually run/import them # # command lines to run substeps (including Spark) are handled by # mrjob.bin.MRJobBinRunner #: alias for this runner; used for picking section of #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``, #: or ``'hadoop'`` alias = None # libjars is only here because the job can set it; might want to # handle this with a warning from the launcher instead OPT_NAMES = { 'bootstrap_mrjob', 'check_input_paths', 'cleanup', 'cleanup_on_failure', 'cmdenv', 'jobconf', 'label', 'libjars', 'local_tmp_dir', 'owner', 'py_files', 'setup', 'upload_archives', 'upload_dirs', 'upload_files' } # if this is true, when bootstrap_mrjob is true, add it through the # setup script _BOOTSTRAP_MRJOB_IN_SETUP = True ### methods to call from your batch script ### def __init__(self, mr_job_script=None, conf_paths=None, extra_args=None, file_upload_args=None, hadoop_input_format=None, hadoop_output_format=None, input_paths=None, output_dir=None, partitioner=None, sort_values=None, stdin=None, step_output_dir=None, **opts): """All runners take the following keyword arguments: :type mr_job_script: str :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work. :type conf_paths: None or list :param conf_paths: List of config files to combine and use, or None to search for mrjob.conf in the default locations. :type extra_args: list of str :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. This is a hook to allow jobs to take additional arguments. :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job. :type hadoop_input_format: str :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type hadoop_output_format: str :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type input_paths: list of str :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin :type output_dir: str :param output_dir: An empty/non-existent directory where Hadoop should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``. This option cannot be set from configuration files. 
If used with the hadoop runner, this path does not need to be fully qualified with ``hdfs://`` URIs because it's understood that it has to be on HDFS. :type partitioner: str :param partitioner: Optional name of a Hadoop partitioner class, e.g. ``'org.apache.hadoop.mapred.lib.HashPartitioner'``. Hadoop streaming will use this to determine how mapper output should be sorted and distributed to reducers. :type sort_values: bool :param sort_values: if true, set partitioners and jobconf variables so that reducers to receive the values associated with any key in sorted order (sorted by their *encoded* value). Also known as secondary sort. :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use as stdin. This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests. :type step_output_dir: str :param step_output_dir: An empty/non-existent directory where Hadoop should put output from all steps other than the last one (this only matters for multi-step jobs). Currently ignored by local runners. """ self._ran_job = False # opts are made from: # # empty defaults (everything set to None) # runner-specific defaults # opts from config file(s) # opts from command line self._opts = self._combine_confs( [(None, {key: None for key in self.OPT_NAMES})] + [(None, self._default_opts())] + load_opts_from_mrjob_confs(self.alias, conf_paths) + [('the command line', opts)] ) log.debug('Active configuration:') log.debug(pprint.pformat({ opt_key: self._obfuscate_opt(opt_key, opt_value) for opt_key, opt_value in self._opts.items() })) self._fs = None # a local tmp directory that will be cleaned up when we're done # access/make this using self._get_local_tmp_dir() self._local_tmp_dir = None self._working_dir_mgr = WorkingDirManager() # mapping from dir to path for corresponding archive. we pick # paths during init(), but don't actually create the archives # until self._create_dir_archives() is called self._dir_to_archive_path = {} # dir archive names (the filename minus ".tar.gz") already taken self._dir_archive_names_taken = set() # set of dir_archives that have actually been created self._dir_archives_created = set() # track (name, path) of files and archives to upload to spark. # these are a subset of those in self._working_dir_mgr self._spark_files = [] self._spark_archives = [] self._upload_mgr = None # define in subclasses that use this self._script_path = mr_job_script if self._script_path: self._working_dir_mgr.add('file', self._script_path) # give this job a unique name self._job_key = self._make_unique_job_key( label=self._opts['label'], owner=self._opts['owner']) # extra args to our job self._extra_args = list(extra_args) if extra_args else [] for extra_arg in self._extra_args: if isinstance(extra_arg, dict): if extra_arg.get('type') != 'file': raise NotImplementedError self._working_dir_mgr.add(**extra_arg) self._spark_files.append( (extra_arg['name'], extra_arg['path'])) # extra file arguments to our job if file_upload_args: log.warning('file_upload_args is deprecated and will be removed' ' in v0.6.0. 
Pass dicts to extra_args instead.') for arg, path in file_upload_args: arg_file = parse_legacy_hash_path('file', path) self._working_dir_mgr.add(**arg_file) self._extra_args.extend([arg, arg_file]) self._spark_files.append((arg_file['name'], arg_file['path'])) # set up uploading for hash_path in self._opts['upload_files']: uf = parse_legacy_hash_path('file', hash_path, must_name='upload_files') self._working_dir_mgr.add(**uf) self._spark_files.append((uf['name'], uf['path'])) for hash_path in self._opts['upload_archives']: ua = parse_legacy_hash_path('archive', hash_path, must_name='upload_archives') self._working_dir_mgr.add(**ua) self._spark_archives.append((ua['name'], ua['path'])) for hash_path in self._opts['upload_dirs']: # pick name based on directory path ud = parse_legacy_hash_path('dir', hash_path, must_name='upload_archives') # but feed working_dir_mgr the archive's path archive_path = self._dir_archive_path(ud['path']) self._working_dir_mgr.add( 'archive', archive_path, name=ud['name']) self._spark_archives.append((ud['name'], archive_path)) # py_files # self._setup is a list of shell commands with path dicts # interleaved; see mrjob.setup.parse_setup_cmd() for details self._setup = self._parse_setup_and_py_files() for cmd in self._setup: for token in cmd: if isinstance(token, dict): # convert dir archives tokens to archives if token['type'] == 'dir': # feed the archive's path to self._working_dir_mgr token['path'] = self._dir_archive_path(token['path']) token['type'] = 'archive' self._working_dir_mgr.add(**token) # Where to read input from (log files, etc.) self._input_paths = input_paths or ['-'] # by default read from stdin if PY2: self._stdin = stdin or sys.stdin else: self._stdin = stdin or sys.stdin.buffer self._stdin_path = None # temp file containing dump from stdin # where a zip file of the mrjob library is stored locally self._mrjob_zip_path = None # store output_dir self._output_dir = output_dir # store partitioner self._partitioner = partitioner # store sort_values self._sort_values = sort_values # store step_output_dir self._step_output_dir = step_output_dir # store hadoop input and output formats self._hadoop_input_format = hadoop_input_format self._hadoop_output_format = hadoop_output_format # A cache for self._get_steps(); also useful as a test hook self._steps = None # this variable marks whether a cleanup has happened and this runner's # output stream is no longer available. self._closed = False ### Options #### def _default_opts(self): try: owner = getpass.getuser() except: owner = None return dict( check_input_paths=True, cleanup=['ALL'], cleanup_on_failure=['NONE'], local_tmp_dir=tempfile.gettempdir(), owner=owner, ) def _combine_confs(self, source_and_opt_list): """Combine several opt dictionaries into one. *source_and_opt_list* is a list of tuples of *source*, *opts* where *opts* is a dictionary and *source* is either None or a description of where the opts came from (usually a path). Only override this if you need truly fine-grained control, including knowledge of the options' source. """ opt_list = [ self._fix_opts(opts, source) for source, opts in source_and_opt_list ] return self._combine_opts(opt_list) def _combine_opts(self, opt_list): """Combine several opt dictionaries into one. *opt_list* is a list of dictionaries containing validated options Override this if you need to base options off the values of other options, but don't need to issue warnings etc. about the options' source. 
""" return combine_opts(self._opt_combiners(), *opt_list) def _opt_combiners(self): """A dictionary mapping opt name to combiner funciton. This won't necessarily include every opt name (we default to :py:func:`~mrjob.conf.combine_value`). """ return _combiners(self.OPT_NAMES) def _fix_opts(self, opts, source=None): """Take an options dictionary, and either return a sanitized version of it, or raise an exception. *source* is either a string describing where the opts came from or None. This ensures that opt dictionaries are really dictionaries and handles deprecated options. """ if source is None: source = 'defaults' # defaults shouldn't trigger warnings if not isinstance(opts, dict): raise TypeError( 'options for %s (from %s) must be a dict' % (self.alias, source)) deprecated_aliases = _deprecated_aliases(self.OPT_NAMES) results = {} for k, v in sorted(opts.items()): # rewrite deprecated aliases if k in deprecated_aliases: if v is None: # don't care continue aliased_opt = deprecated_aliases log.warning('Deprecated option %s (from %s) has been renamed' ' to %s and will be removed in v0.7.0' % ( k, source, aliased_opt)) if opts.get(aliased_opt) is not None: return # don't overwrite non-aliased opt k = aliased_opt if k in self.OPT_NAMES: results[k] = None if v is None else self._fix_opt(k, v, source) else: log.warning('Unexpected option %s (from %s)' % (k, source)) return results def _fix_opt(self, opt_key, opt_value, source): """Fix a single option, returning its correct value or raising an exception. This is not called for options that are ``None``. This currently handles cleanup opts. Override this if you require additional opt validation or cleanup. """ if opt_key in ('cleanup', 'cleanup_on_failure'): return self._fix_cleanup_opt(opt_key, opt_value, source) else: return opt_value def _fix_cleanup_opt(self, opt_key, opt_value, source): """Fix a cleanup option, or raise ValueError.""" if isinstance(opt_value, string_types): opt_value = [opt_value] if 'NONE' in opt_value and len(set(opt_value)) > 1: raise ValueError( 'Cannot clean up both nothing and something!' ' (%s option from %s)' % (opt_key, source)) for cleanup_type in opt_value: if cleanup_type not in CLEANUP_CHOICES: raise ValueError( '%s must be one of %s, not %s (from %s)' % ( opt_key, ', '.join(CLEANUP_CHOICES), opt_value, source)) return opt_value def _obfuscate_opt(self, opt_key, opt_value): """Return value of opt to show in debug printout. Used to obfuscate credentials, etc.""" return opt_value ### Filesystem object ### @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for the local filesystem. """ if self._fs is None: # wrap LocalFilesystem in CompositeFilesystem to get IOError # on URIs (see #1185) self._fs = CompositeFilesystem(LocalFilesystem()) return self._fs ### Running the job and parsing output ### def run(self): """Run the job, and block until it finishes. Raise :py:class:`~mrjob.step.StepFailedException` if there are any problems (except on :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the actual exception that caused the step to fail). """ if not self._script_path: raise AssertionError("No script to run!") if self._ran_job: raise AssertionError("Job already ran!") self._create_dir_archives() self._check_input_paths() self._run() self._ran_job = True def cat_output(self): """Stream the jobs output, as a stream of ``bytes``. If there are multiple output files, there will be an empty bytestring (``b''``) between them. .. 
versionadded:: 0.6.0 In previous versions, you'd use :py:meth:`stream_output`. """ output_dir = self.get_output_dir() if output_dir is None: raise AssertionError('Run the job before streaming output') if self._closed is True: log.warning( 'WARNING! Trying to stream output from a closed runner, output' ' will probably be empty.') log.info('Streaming final output from %s...' % output_dir) def split_path(path): while True: base, name = os.path.split(path) # no more elements if not name: break yield name path = base def ls_output(): for filename in self.fs.ls(output_dir): subpath = filename[len(output_dir):] if not (any(name.startswith('_') for name in split_path(subpath))): yield filename for i, filename in enumerate(ls_output()): if i > 0: yield b'' # EOF of previous file for chunk in self.fs._cat_file(filename): yield chunk def stream_output(self): """Like :py:meth:`cat_output` except that it groups bytes into lines. Equivalent to ``mrjob.util.to_lines(runner.stream_output())``. .. deprecated:: 0.6.0 """ log.warning('stream_output() is deprecated and will be removed in' ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())' ' instead.') return to_lines(self.cat_output()) def _cleanup_mode(self, mode=None): """Actual cleanup action to take based on various options""" if self._script_path and not self._ran_job: return mode or self._opts['cleanup_on_failure'] else: return mode or self._opts['cleanup'] def _cleanup_cloud_tmp(self): """Cleanup any files/directories on cloud storage (e.g. S3) we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only EMR runner does this def _cleanup_hadoop_tmp(self): """Cleanup any files/directories on HDFS we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only Hadoop runner does this def _cleanup_local_tmp(self): """Cleanup any files/directories on the local machine we created while running this job. Should be safe to run this at any time, or multiple times. This particular function removes any local tmp directories added to the list self._local_tmp_dirs This won't remove output_dir if it's outside of our tmp dir. """ if self._local_tmp_dir: log.info('Removing temp directory %s...' % self._local_tmp_dir) try: shutil.rmtree(self._local_tmp_dir) except OSError as e: log.exception(e) self._local_tmp_dir = None def _cleanup_cluster(self): """Terminate the cluster if there is one.""" pass # this only happens on EMR def _cleanup_logs(self): """Cleanup any log files that are created as a side-effect of the job. """ pass # this only happens on EMR def _cleanup_job(self): """Stop any jobs that we created that are still running.""" pass # currently disabled (see #1241) def cleanup(self, mode=None): """Clean up running jobs, temp files, and logs, subject to the *cleanup* option passed to the constructor. If you create your runner in a :keyword:`with` block, :py:meth:`cleanup` will be called automatically:: with mr_job.make_runner() as runner: ... # cleanup() called automatically here :param mode: override *cleanup* passed into the constructor. 
Should be a list of strings from :py:data:`CLEANUP_CHOICES` """ mode = self._cleanup_mode(mode) def mode_has(*args): return any((choice in mode) for choice in args) if self._script_path and not self._ran_job: if mode_has('CLUSTER', 'ALL'): self._cleanup_cluster() if mode_has('JOB', 'ALL'): self._cleanup_job() if mode_has('ALL', 'TMP', 'CLOUD_TMP'): self._cleanup_cloud_tmp() if mode_has('ALL', 'TMP', 'HADOOP_TMP'): self._cleanup_hadoop_tmp() if mode_has('ALL', 'TMP', 'LOCAL_TMP'): self._cleanup_local_tmp() if mode_has('ALL', 'LOGS'): self._cleanup_logs() self._closed = True def counters(self): """Get counters associated with this run in this form:: [{'group name': {'counter1': 1, 'counter2': 2}}, {'group name': ...}] The list contains an entry for every step of the current job. """ raise NotImplementedError ### hooks for the with statement ### def __enter__(self): """Don't do anything special at start of with block""" return self def __exit__(self, type, value, traceback): """Call self.cleanup() at end of with block.""" self.cleanup() ### more runner information ### def get_opts(self): """Get options set for this runner, as a dict.""" log.warning('get_opts() is deprecated and will be removed in v0.7.0') return copy.deepcopy(self._opts) def get_job_key(self): """Get the unique key for the job run by this runner. This has the format ``label.owner.date.time.microseconds`` """ return self._job_key def get_output_dir(self): """Find the directory containing the job output. If the job hasn't run yet, returns None""" if self._script_path and not self._ran_job: return None return self._output_dir ### other methods you need to implement in your subclass ### def get_hadoop_version(self): """Return the version number of the Hadoop environment as a string if Hadoop is being used or simulated. Return None if not applicable. :py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster. :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an additional `hadoop_version` option to specify which version it simulates. :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at all. """ return None # you'll probably wan't to add your own __init__() and cleanup() as well def _run(self): """Run the job.""" raise NotImplementedError ### internal utilities for implementing MRJobRunners ### def _get_local_tmp_dir(self): """Create a tmp directory on the local filesystem that will be cleaned up by self.cleanup()""" if not self._local_tmp_dir: path = os.path.join(self._opts['local_tmp_dir'], self._job_key) log.info('Creating temp directory %s' % path) if os.path.isdir(path): shutil.rmtree(path) os.makedirs(path) self._local_tmp_dir = path return self._local_tmp_dir def _make_unique_job_key(self, label=None, owner=None): """Come up with a useful unique ID for this job. We use this to choose the output directory, etc. for the job. """ # use the name of the script if one wasn't explicitly # specified if not label: if self._script_path: label = os.path.basename(self._script_path).split('.')[0] else: label = 'no_script' if not owner: owner = 'no_user' now = datetime.datetime.utcnow() return '%s.%s.%s.%06d' % ( label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond) def _get_steps(self): """Call the job script to find out how many steps it has, and whether there are mappers and reducers for each step. Validate its output. Returns output as described in :ref:`steps-format`. Results are cached, so call this as many times as you want. 
""" if self._steps is None: self._steps = self._load_steps() return self._steps def _load_steps(self): """Ask job how many steps it has, and whether there are mappers and reducers for each step. Returns output as described in :ref:`steps-format`. """ raise NotImplementedError def _get_step(self, step_num): """Get a single step (calls :py:meth:`_get_steps`).""" return self._get_steps()[step_num] def _num_steps(self): """Get the number of steps (calls :py:meth:`get_steps`).""" return len(self._get_steps()) def _has_streaming_steps(self): """Are any of our steps Hadoop streaming steps?""" return any(step['type'] == 'streaming' for step in self._get_steps()) def _has_spark_steps(self): """Are any of our steps Spark steps (either spark or spark_script)""" return any(_is_spark_step_type(step['type']) for step in self._get_steps()) def _args_for_task(self, step_num, mrc): return [ '--step-num=%d' % step_num, '--%s' % mrc, ] + self._mr_job_extra_args() def _mr_job_extra_args(self, local=False): """Return arguments to add to every invocation of MRJob. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ result = [] for extra_arg in self._extra_args: if isinstance(extra_arg, dict): if local: result.append(extra_arg['path']) else: result.append(self._working_dir_mgr.name(**extra_arg)) else: result.append(extra_arg) return result def _dir_archive_path(self, dir_path): """Assign a path for the archive of *dir_path* but don't actually create anything.""" if dir_path not in self._dir_to_archive_path: # we can check local paths now if not (is_uri(dir_path) or os.path.isdir(dir_path)): raise OSError('%s is not a directory!' % dir_path) name = name_uniquely( dir_path, names_taken=self._dir_archive_names_taken) self._dir_archive_names_taken.add(name) self._dir_to_archive_path[dir_path] = os.path.join( self._get_local_tmp_dir(), 'archives', name + '.tar.gz') return self._dir_to_archive_path[dir_path] def _create_dir_archives(self): """Call this to create all dir archives""" for dir_path in sorted(set(self._dir_to_archive_path)): self._create_dir_archive(dir_path) def _create_dir_archive(self, dir_path): """Helper for :py:meth:`archive_dir`""" if not self.fs.exists(dir_path): raise OSError('%s does not exist') tar_gz_path = self._dir_archive_path(dir_path) if tar_gz_path in self._dir_archives_created: return # already created if not os.path.isdir(os.path.dirname(tar_gz_path)): os.makedirs(os.path.dirname(tar_gz_path)) # for remote files tmp_download_path = os.path.join( self._get_local_tmp_dir(), 'tmp-download') log.info('Archiving %s -> %s' % (dir_path, tar_gz_path)) with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz: for path in self.fs.ls(dir_path): # fs.ls() only lists files if path == dir_path: raise OSError('%s is a file, not a directory!' % dir_path) # TODO: do we need this? if os.path.realpath(path) == os.path.realpath(tar_gz_path): raise OSError( 'attempted to archive %s into itself!' 
% tar_gz_path) if is_uri(path): path_in_tar_gz = path[len(dir_path):].lstrip('/') log.info(' downloading %s -> %s' % ( path, tmp_download_path)) with open(tmp_download_path, 'wb') as f: for chunk in self.fs.cat(path): f.write(chunk) local_path = tmp_download_path else: path_in_tar_gz = path[len(dir_path):].lstrip(os.sep) local_path = path log.debug(' adding %s to %s' % (path, tar_gz_path)) tar_gz.add(local_path, path_in_tar_gz, recursive=False) self._dir_archives_created.add(tar_gz_path) def _bootstrap_mrjob(self): """Should we bootstrap mrjob?""" if self._opts['bootstrap_mrjob'] is None: return self._opts['interpreter'] is None else: return bool(self._opts['bootstrap_mrjob']) def _get_input_paths(self): """Get the paths to input files, dumping STDIN to a local file if need be.""" if '-' in self._input_paths: if self._stdin_path is None: # prompt user, so they don't think the process has stalled log.info('reading from STDIN') stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') log.debug('dumping stdin to local file %s' % stdin_path) with open(stdin_path, 'wb') as stdin_file: for line in self._stdin: # catch missing newlines (often happens with test data) if not line.endswith(b'\n'): line += b'\n' stdin_file.write(line) self._stdin_path = stdin_path return [self._stdin_path if p == '-' else p for p in self._input_paths] def _check_input_paths(self): """Check that input exists prior to running the job, if the `check_input_paths` option is true.""" if not self._opts['check_input_paths']: return for path in self._input_paths: if path == '-': continue # STDIN always exists if not self.fs.can_handle_path(path): continue # e.g. non-S3 URIs on EMR if not self.fs.exists(path): raise IOError( 'Input path %s does not exist!' % (path,)) def _intermediate_output_uri(self, step_num, local=False): """A URI for intermediate output for the given step number.""" join = os.path.join if local else posixpath.join return join( self._step_output_dir or self._default_step_output_dir(), '%04d' % step_num) def _default_step_output_dir(self): """Where to put output for steps other than the last one, if not specified by the *output_dir* constructor keyword. Usually you want this to be on HDFS (most efficient). Define this in your runner subclass. """ raise NotImplementedError def _step_input_uris(self, step_num): """A list of URIs to use as input for the given step. For all except the first step, this list will have a single item (a directory).""" if step_num == 0: return [self._upload_mgr.uri(path) for path in self._get_input_paths()] else: return [self._intermediate_output_uri(step_num - 1)] def _step_output_uri(self, step_num): """URI to use as output for the given step. This is either an intermediate dir (see :py:meth:`intermediate_output_uri`) or ``self._output_dir`` for the final step.""" if step_num == len(self._get_steps()) - 1: return self._output_dir else: return self._intermediate_output_uri(step_num) def _interpolate_input_and_output(self, args, step_num): """Replace :py:data:`~mrjob.step.INPUT` and :py:data:`~mrjob.step.OUTPUT` in arguments to a jar or Spark step. If there are multiple input paths (i.e. on the first step), they'll be joined with a comma. 
""" def interpolate(arg): if arg == mrjob.step.INPUT: return ','.join(self._step_input_uris(step_num)) elif arg == mrjob.step.OUTPUT: return self._step_output_uri(step_num) else: return arg return [interpolate(arg) for arg in args] def _create_mrjob_zip(self): """Make a zip of the mrjob library, without .pyc or .pyo files, This will also set ``self._mrjob_zip_path`` and return it. Typically called from :py:meth:`_create_setup_wrapper_script`. It's safe to call this method multiple times (we'll only create the zip file once.) """ if not self._mrjob_zip_path: # find mrjob library import mrjob if not os.path.basename(mrjob.__file__).startswith('__init__.'): raise Exception( "Bad path for mrjob library: %s; can't bootstrap mrjob", mrjob.__file__) mrjob_dir = os.path.dirname(mrjob.__file__) or '.' zip_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.zip') def filter_path(path): filename = os.path.basename(path) return not(filename.lower().endswith('.pyc') or filename.lower().endswith('.pyo') or # filter out emacs backup files filename.endswith('~') or # filter out emacs lock files filename.startswith('.#') or # filter out MacFuse resource forks filename.startswith('._')) log.debug('archiving %s -> %s as %s' % ( mrjob_dir, zip_path, os.path.join('mrjob', ''))) zip_dir(mrjob_dir, zip_path, filter=filter_path, prefix='mrjob') self._mrjob_zip_path = zip_path return self._mrjob_zip_path def _jobconf_for_step(self, step_num): """Get the jobconf dictionary, optionally including step-specific jobconf info. Also translate jobconfs to the current Hadoop version, if necessary. """ step = self._get_step(step_num) # _sort_values_jobconf() isn't relevant to Spark, # but it doesn't do any harm either jobconf = combine_dicts(self._sort_values_jobconf(), self._opts['jobconf'], step.get('jobconf')) # if user is using the wrong jobconfs, add in the correct ones # and log a warning hadoop_version = self.get_hadoop_version() if hadoop_version: jobconf = translate_jobconf_dict(jobconf, hadoop_version) return jobconf def _sort_values_jobconf(self): """Jobconf dictionary to enable sorting by value. """ if not self._sort_values: return {} # translate _SORT_VALUES_JOBCONF to the correct Hadoop version, # without logging a warning hadoop_version = self.get_hadoop_version() jobconf = {} for k, v in _SORT_VALUES_JOBCONF.items(): if hadoop_version: jobconf[translate_jobconf(k, hadoop_version)] = v else: for j in translate_jobconf_for_all_versions(k): jobconf[j] = v return jobconf def _sort_values_partitioner(self): """Partitioner to use with *sort_values* keyword to the constructor.""" if self._sort_values: return _SORT_VALUES_PARTITIONER else: return None def _parse_setup_and_py_files(self): """Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`, and patch in *py_files*. 
""" setup = [] # py_files for path in self._opts['py_files']: # Spark (at least v1.3.1) doesn't work with # and --py-files, # see #1375 if '#' in path: raise ValueError("py_files cannot contain '#'") path_dict = parse_legacy_hash_path('file', path) setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH']) # setup for cmd in self._opts['setup']: setup.append(parse_setup_cmd(cmd)) return setup def _upload_args(self): # just upload every file and archive in the working dir manager return self._upload_args_helper('-files', None, '-archives', None) def _upload_args_helper( self, files_opt_str, files, archives_opt_str, archives): args = [] file_hash_paths = list(self._arg_hash_paths('file', files)) if file_hash_paths: args.append(files_opt_str) args.append(','.join(file_hash_paths)) archive_hash_paths = list(self._arg_hash_paths('archive', archives)) if archive_hash_paths: args.append(archives_opt_str) args.append(','.join(archive_hash_paths)) return args def _arg_hash_paths(self, type, named_paths=None): """Helper function for the *upload_args methods.""" if named_paths is None: # just return everything managed by _working_dir_mgr named_paths = sorted( self._working_dir_mgr.name_to_path(type).items()) for name, path in named_paths: if not name: name = self._working_dir_mgr.name(type, path) uri = self._upload_mgr.uri(path) yield '%s#%s' % (uri, name)
class HadoopInTheCloudJobRunner(MRJobBinRunner): """Abstract base class for all Hadoop-in-the-cloud services.""" alias = '_cloud' OPT_NAMES = MRJobBinRunner.OPT_NAMES | { 'bootstrap', 'bootstrap_python', 'check_cluster_every', 'cloud_fs_sync_secs', 'cloud_part_size_mb', 'cloud_tmp_dir', 'cluster_id', 'core_instance_type', 'extra_cluster_params', 'hadoop_streaming_jar', 'image_id', 'image_version', 'instance_type', 'master_instance_type', 'max_mins_idle', 'max_hours_idle', 'num_core_instances', 'num_task_instances', 'region', 'ssh_bind_ports', 'ssh_tunnel', 'ssh_tunnel_is_open', 'task_instance_type', 'zone', } # so far, every service provides the ability to run bootstrap scripts _BOOTSTRAP_MRJOB_IN_PY_FILES = False def __init__(self, **kwargs): super(HadoopInTheCloudJobRunner, self).__init__(**kwargs) # if *cluster_id* is not set, ``self._cluster_id`` will be # set when we create or join a cluster self._cluster_id = self._opts['cluster_id'] # bootstrapping self._bootstrap = self._bootstrap_python() + self._parse_bootstrap() # add files to manager self._bootstrap_dir_mgr = WorkingDirManager() for cmd in self._bootstrap: for token in cmd: if isinstance(token, dict): # convert dir archive tokens to archives if token['type'] == 'dir': token['path'] = self._dir_archive_path(token['path']) token['type'] = 'archive' self._bootstrap_dir_mgr.add(**token) # we'll create this script later, as needed self._master_bootstrap_script_path = None # ssh state # the process for the SSH tunnel self._ssh_proc = None # if this is true, stop trying to launch the SSH tunnel self._give_up_on_ssh_tunnel = False # store the (tunneled) URL of the job tracker/resource manager self._ssh_tunnel_url = None ### Options ### def _default_opts(self): return combine_dicts( super(HadoopInTheCloudJobRunner, self)._default_opts(), dict( cloud_part_size_mb=100, # 100 MB max_mins_idle=_DEFAULT_MAX_MINS_IDLE, # don't use a list because it makes it hard to read option # values when running in verbose mode. See #1284 ssh_bind_ports=xrange(40001, 40841), ssh_tunnel=False, ssh_tunnel_is_open=False, # ssh_bin isn't included here. For example, the Dataproc # runner launches ssh through the gcloud util ), ) def _fix_opts(self, opts, source=None): opts = super(HadoopInTheCloudJobRunner, self)._fix_opts( opts, source=source) # cloud_part_size_mb should be a number if opts.get('cloud_part_size_mb') is not None: if not isinstance(opts['cloud_part_size_mb'], (integer_types, float)): raise TypeError('cloud_part_size_mb must be a number') # patch max_hours_idle into max_mins_idle (see #1663) if opts.get('max_hours_idle') is not None: log.warning( 'max_hours_idle is deprecated and will be removed in v0.7.0.' + (' Please use max_mins_idle instead' if opts.get('max_mins_idle') is None else '')) if opts.get('max_mins_idle') is None: opts['max_mins_idle'] = opts['max_hours_idle'] * 60 return opts def _combine_opts(self, opt_list): """Propagate *instance_type* to other instance type opts, if not already set. Also propagate core instance type to task instance type, if it's not already set. 
""" opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list) if opts['instance_type']: # figure out how late in the configs opt was set (setting # --instance_type on the command line overrides core_instance_type # set in configs) opt_priority = {k: -1 for k in opts} for i, sub_opts in enumerate(opt_list): for k, v in sub_opts.items(): if v == opts[k]: opt_priority[k] = i # instance_type only affects master_instance_type if there are # no other instances if opts['num_core_instances'] or opts['num_task_instances']: propagate_to = ['core_instance_type', 'task_instance_type'] else: propagate_to = ['master_instance_type'] for k in propagate_to: if opts[k] is None or ( opt_priority[k] < opt_priority['instance_type']): opts[k] = opts['instance_type'] if not opts['task_instance_type']: opts['task_instance_type'] = opts['core_instance_type'] return opts ### Bootstrapping ### def _bootstrap_python(self): """Redefine this to return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())' to make sure a compatible version of Python is installed If the *bootstrap_python* option is false, should always return ``[]``. """ return [] def _cp_to_local_cmd(self): """Command to copy files from the cloud to the local directory (usually via Hadoop). Redefine this as needed; for example, on EMR, we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't installed at bootstrap time.""" return 'hadoop fs -copyToLocal' def _parse_bootstrap(self): """Parse the *bootstrap* option with :py:func:`mrjob.setup.parse_setup_cmd()`. """ return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']] def _create_master_bootstrap_script_if_needed(self): """Helper for :py:meth:`_add_bootstrap_files_for_upload`. Create the master bootstrap script and write it into our local temp directory. Set self._master_bootstrap_script_path. This will do nothing if there are no bootstrap scripts or commands, or if it has already been called.""" if self._master_bootstrap_script_path: return # don't bother if we're not starting a cluster if self._cluster_id: return # Also don't bother if we're not bootstrapping if not (self._bootstrap or self._bootstrap_mrjob()): return # create mrjob.zip if we need it, and add commands to install it mrjob_bootstrap = [] if self._bootstrap_mrjob(): assert self._mrjob_zip_path path_dict = { 'type': 'file', 'name': None, 'path': self._mrjob_zip_path} self._bootstrap_dir_mgr.add(**path_dict) # find out where python keeps its libraries mrjob_bootstrap.append([ "__mrjob_PYTHON_LIB=$(%s -c " "'from distutils.sysconfig import get_python_lib;" " print(get_python_lib())')" % cmd_line(self._python_bin())]) # remove anything that might be in the way (see #1567) mrjob_bootstrap.append(['sudo rm -rf $__mrjob_PYTHON_LIB/mrjob']) # unzip mrjob.zip mrjob_bootstrap.append( ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB']) # re-compile pyc files now, since mappers/reducers can't # write to this directory. 
Don't fail if there is extra # un-compileable crud in the tarball (this would matter if # sh_bin were 'sh -e') mrjob_bootstrap.append( ['sudo %s -m compileall -q' ' -f $__mrjob_PYTHON_LIB/mrjob && true' % cmd_line(self._python_bin())]) path = os.path.join(self._get_local_tmp_dir(), 'b.sh') log.info('writing master bootstrap script to %s' % path) contents = self._master_bootstrap_script_content( self._bootstrap + mrjob_bootstrap) self._write_script(contents, path, 'master bootstrap script') self._master_bootstrap_script_path = path def _master_bootstrap_script_content(self, bootstrap): """Return a list containing the lines of the master bootstrap script. (without trailing newlines) """ out = [] # shebang, precommands out.extend(self._start_of_sh_script()) out.append('') # for example, create a tmp dir and cd to it if self._bootstrap_pre_commands(): out.extend(self._bootstrap_pre_commands()) out.append('') # store $PWD out.append('# store $PWD') out.append('__mrjob_PWD=$PWD') out.append('') # special case for PWD being in /, which happens on Dataproc # (really we should cd to tmp or something) out.append('if [ $__mrjob_PWD = "/" ]; then') out.append(' __mrjob_PWD=""') out.append('fi') out.append('') # run commands in a block so we can redirect stdout to stderr # (e.g. to catch errors from compileall). See #370 out.append('{') # download files out.append(' # download files and mark them executable') cp_to_local = self._cp_to_local_cmd() # TODO: why bother with $__mrjob_PWD here, since we're already in it? for name, path in sorted( self._bootstrap_dir_mgr.name_to_path('file').items()): uri = self._upload_mgr.uri(path) out.append(' %s %s $__mrjob_PWD/%s' % (cp_to_local, pipes.quote(uri), pipes.quote(name))) # imitate Hadoop Distributed Cache (see #1602) out.append(' chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name)) out.append('') # download and unarchive archives archive_names_and_paths = sorted( self._bootstrap_dir_mgr.name_to_path('archive').items()) if archive_names_and_paths: # make tmp dir if needed out.append(' # download and unpack archives') out.append(' __mrjob_TMP=$(mktemp -d)') out.append('') for name, path in archive_names_and_paths: uri = self._upload_mgr.uri(path) ext = file_ext(basename(path)) # copy file to tmp dir quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name) out.append(' %s %s %s' % ( cp_to_local, pipes.quote(uri), quoted_archive_path)) # unarchive file if ext not in _EXT_TO_UNARCHIVE_CMD: raise KeyError('unknown archive file extension: %s' % path) unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext] out.append(' ' + unarchive_cmd % dict( file=quoted_archive_path, dir='$__mrjob_PWD/' + pipes.quote(name))) # imitate Hadoop Distributed Cache (see #1602) out.append( ' chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name)) out.append('') # run bootstrap commands out.append(' # bootstrap commands') for cmd in bootstrap: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = ' ' for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._bootstrap_dir_mgr.name(**token)) else: # it's raw script line += token out.append(line) out.append('} 1>&2') # stdout -> stderr for ease of error log parsing return out def _bootstrap_pre_commands(self): """A list of hard-coded commands to run at the beginning of the bootstrap script. 
Currently used by dataproc to cd into a tmp dir.""" return [] def _start_of_sh_script(self): """Return a list of lines (without trailing newlines) containing the shell script shebang and pre-commands.""" out = [] # shebang sh_bin = self._sh_bin() if not sh_bin[0].startswith('/'): sh_bin = ['/usr/bin/env'] + sh_bin out.append('#!' + cmd_line(sh_bin)) # hook for 'set -e', etc. (see #1549) out.extend(self._sh_pre_commands()) return out ### Launching Clusters ### def _add_extra_cluster_params(self, params): """Return a dict with the *extra_cluster_params* opt patched into *params*, and ``None`` values removed.""" params = deepcopy(params) for k, v in sorted(self._opts['extra_cluster_params'].items()): _patch_params(params, k, v) return params ### SSH Tunnel ### def _ssh_tunnel_args(self, bind_port): """Redefine this in your subclass. You will probably want to call :py:meth:`_ssh_tunnel_opts` somewhere in here. Should return the list of args used to run the command to open the SSH tunnel, bound to *bind_port* on your computer, or ``None`` if it isn't possible to set up an SSH tunnel. """ return None def _ssh_tunnel_config(self): """Redefine this in your subclass. Should return a dict with the following keys: *localhost*: once we SSH in, is the web interface? reachable at ``localhost`` *name*: either ``'job tracker'`` or ``'resource manager'`` *path*: path of main page on web interface (e.g. "/cluster") *port*: port number of the web interface """ raise NotImplementedError def _launch_ssh_proc(self, args): """The command used to create a :py:class:`subprocess.Popen` to run the SSH tunnel. You usually don't need to redefine this.""" log.debug('> %s' % cmd_line(args)) return Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE) def _ssh_launch_wait_secs(self): """Wait this long after launching the SSH process before checking for failure (default 1 second). You may redefine this.""" return 1.0 def _set_up_ssh_tunnel(self): """Call this whenever you think it is possible to SSH to your cluster. This sets :py:attr:`_ssh_proc`. Does nothing if :mrjob-opt:`ssh_tunnel` is not set, or there is already a tunnel process running. """ # did the user request an SSH tunnel? if not self._opts['ssh_tunnel']: return # no point in trying to launch a nonexistent command twice if self._give_up_on_ssh_tunnel: return # did we already launch the SSH tunnel process? is it still running? if self._ssh_proc: self._ssh_proc.poll() if self._ssh_proc.returncode is None: return else: log.warning(' Oops, ssh subprocess exited with return code' ' %d, restarting...' % self._ssh_proc.returncode) self._ssh_proc = None tunnel_config = self._ssh_tunnel_config() bind_port = None popen_exception = None ssh_tunnel_args = [] for bind_port in self._pick_ssh_bind_ports(): ssh_proc = None ssh_tunnel_args = self._ssh_tunnel_args(bind_port) # can't launch SSH tunnel right now if not ssh_tunnel_args: return try: ssh_proc = self._launch_ssh_proc(ssh_tunnel_args) except OSError as ex: # e.g. OSError(2, 'File not found') popen_exception = ex # warning handled below break if ssh_proc: time.sleep(self._ssh_launch_wait_secs()) ssh_proc.poll() # still running. 
We are golden if ssh_proc.returncode is None: self._ssh_proc = ssh_proc break else: ssh_proc.stdin.close() ssh_proc.stdout.close() ssh_proc.stderr.close() if self._ssh_proc: if self._opts['ssh_tunnel_is_open']: bind_host = socket.getfqdn() else: bind_host = 'localhost' self._ssh_tunnel_url = 'http://%s:%d%s' % ( bind_host, bind_port, tunnel_config['path']) log.info(' Connect to %s at: %s' % ( tunnel_config['name'], self._ssh_tunnel_url)) else: if popen_exception: # this only happens if the ssh binary is not present # or not executable (so tunnel_config and the args to the # ssh binary don't matter) log.warning( " Couldn't open SSH tunnel: %s" % popen_exception) self._give_up_on_ssh_tunnel = True return else: log.warning( ' Failed to open ssh tunnel to %s' % tunnel_config['name']) def _kill_ssh_tunnel(self): """Send SIGKILL to SSH tunnel, if it's running.""" if not self._ssh_proc: return self._ssh_proc.poll() if self._ssh_proc.returncode is None: log.info('Killing our SSH tunnel (pid %d)' % self._ssh_proc.pid) self._ssh_proc.stdin.close() self._ssh_proc.stdout.close() self._ssh_proc.stderr.close() try: if hasattr(signal, 'SIGKILL'): os.kill(self._ssh_proc.pid, signal.SIGKILL) else: # Windows doesn't have SIGKILL, see #1892 os.kill(self._ssh_proc.pid, signal.SIGABRT) except Exception as e: log.exception(e) self._ssh_proc = None self._ssh_tunnel_url = None def _ssh_tunnel_opts(self, bind_port): """Options to SSH related to setting up a tunnel (rather than SSHing in). Helper for :py:meth:`_ssh_tunnel_args`. """ args = self._ssh_local_tunnel_opt(bind_port) + [ '-N', '-n', '-q', ] if self._opts['ssh_tunnel_is_open']: args.extend(['-g', '-4']) # -4: listen on IPv4 only return args def _ssh_local_tunnel_opt(self, bind_port): """Helper for :py:meth:`_ssh_tunnel_opts`.""" tunnel_config = self._ssh_tunnel_config() return [ '-L', '%d:%s:%d' % ( bind_port, self._job_tracker_host(), tunnel_config['port'], ), ] def _pick_ssh_bind_ports(self): """Pick a list of ports to try binding our SSH tunnel to. We will try to bind the same port for any given cluster (Issue #67) """ # don't perturb the random number generator random_state = random.getstate() try: # seed random port selection on cluster ID random.seed(self._cluster_id) num_picks = min(_MAX_SSH_RETRIES, len(self._opts['ssh_bind_ports'])) return random.sample(self._opts['ssh_bind_ports'], num_picks) finally: random.setstate(random_state)
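# --- illustrative sketch, not part of mrjob ---
# _pick_ssh_bind_ports() above seeds the random module with the cluster ID
# so a given cluster always tries the same SSH bind ports, then restores
# the global random state so callers aren't perturbed. The same technique
# in isolation (the port range, pick count, and function name here are
# made-up stand-ins, not mrjob's actual values):

import random

def _sketch_pick_ports(cluster_id, ports=range(40001, 40841), max_picks=20):
    state = random.getstate()
    try:
        random.seed(cluster_id)  # deterministic per cluster ID
        return random.sample(ports, min(max_picks, len(ports)))
    finally:
        random.setstate(state)   # leave the global RNG as we found it

# _sketch_pick_ports('j-CLUSTER123') returns the same port list every call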
class HadoopInTheCloudJobRunner(MRJobBinRunner): """Abstract base class for all Hadoop-in-the-cloud services.""" alias = '_cloud' OPT_NAMES = MRJobBinRunner.OPT_NAMES | { 'bootstrap', 'bootstrap_python', 'check_cluster_every', 'cloud_fs_sync_secs', 'cloud_tmp_dir', 'cluster_id', 'core_instance_type', 'extra_cluster_params', 'image_version', 'instance_type', 'master_instance_type', 'max_mins_idle', 'max_hours_idle', 'num_core_instances', 'num_task_instances', 'region', 'task_instance_type', 'zone', } # so far, every service provides the ability to run bootstrap scripts _BOOTSTRAP_MRJOB_IN_SETUP = False def __init__(self, **kwargs): super(HadoopInTheCloudJobRunner, self).__init__(**kwargs) # if *cluster_id* is not set, ``self._cluster_id`` will be # set when we create or join a cluster self._cluster_id = self._opts['cluster_id'] # bootstrapping self._bootstrap = self._bootstrap_python() + self._parse_bootstrap() # add files to manager self._bootstrap_dir_mgr = WorkingDirManager() for cmd in self._bootstrap: for token in cmd: if isinstance(token, dict): # convert dir archive tokens to archives if token['type'] == 'dir': token['path'] = self._dir_archive_path(token['path']) token['type'] = 'archive' self._bootstrap_dir_mgr.add(**token) # we'll create this script later, as needed self._master_bootstrap_script_path = None ### Options ### def _fix_opts(self, opts, source=None): opts = super(HadoopInTheCloudJobRunner, self)._fix_opts( opts, source=source) # patch max_hours_idle into max_mins_idle (see #1663) if opts.get('max_hours_idle') is not None: log.warning( 'max_hours_idle is deprecated and will be removed in v0.7.0.' + (' Please use max_mins_idle instead' if opts.get('max_mins_idle') is None else '')) if opts.get('max_mins_idle') is None: if opts.get('max_hours_idle') is not None: opts['max_mins_idle'] = opts['max_hours_idle'] * 60 else: opts['max_mins_idle'] = _DEFAULT_MAX_MINS_IDLE # warn about issues with if opts['max_mins_idle'] < _DEFAULT_MAX_MINS_IDLE: log.warning('Setting max_mins_idle to less than %.1f may result' ' in cluster shutting down before job can run' % _DEFAULT_MAX_MINS_IDLE) return opts def _combine_opts(self, opt_list): """Propagate *instance_type* to other instance type opts, if not already set. Also propagate core instance type to task instance type, if it's not already set. """ opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list) if opts['instance_type']: # figure out how late in the configs opt was set (setting # --instance_type on the command line overrides core_instance_type # set in configs) opt_priority = {k: -1 for k in opts} for i, sub_opts in enumerate(opt_list): for k, v in sub_opts.items(): if v == opts[k]: opt_priority[k] = i # instance_type only affects master_instance_type if there are # no other instances if opts['num_core_instances'] or opts['num_task_instances']: propagate_to = ['core_instance_type', 'task_instance_type'] else: propagate_to = ['master_instance_type'] for k in propagate_to: if opts[k] is None or ( opt_priority[k] < opt_priority['instance_type']): opts[k] = opts['instance_type'] if not opts['task_instance_type']: opts['task_instance_type'] = opts['core_instance_type'] return opts ### Bootstrapping ### def _bootstrap_python(self): """Redefine this to return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())' to make sure a compatible version of Python is installed If the *bootstrap_python* option is false, should always return ``[]``. 
""" return [] def _cp_to_local_cmd(self): """Command to copy files from the cloud to the local directory (usually via Hadoop). Redefine this as needed; for example, on EMR, we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't installed at bootstrap time.""" return 'hadoop fs -copyToLocal' def _parse_bootstrap(self): """Parse the *bootstrap* option with :py:func:`mrjob.setup.parse_setup_cmd()`. """ return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']] def _create_master_bootstrap_script_if_needed(self): """Helper for :py:meth:`_add_bootstrap_files_for_upload`. Create the master bootstrap script and write it into our local temp directory. Set self._master_bootstrap_script_path. This will do nothing if there are no bootstrap scripts or commands, or if it has already been called.""" if self._master_bootstrap_script_path: return # don't bother if we're not starting a cluster if self._cluster_id: return # Also don't bother if we're not bootstrapping if not (self._bootstrap or self._bootstrap_mrjob()): return # create mrjob.zip if we need it, and add commands to install it mrjob_bootstrap = [] if self._bootstrap_mrjob(): assert self._mrjob_zip_path path_dict = { 'type': 'file', 'name': None, 'path': self._mrjob_zip_path} self._bootstrap_dir_mgr.add(**path_dict) # find out where python keeps its libraries mrjob_bootstrap.append([ "__mrjob_PYTHON_LIB=$(%s -c " "'from distutils.sysconfig import get_python_lib;" " print(get_python_lib())')" % cmd_line(self._python_bin())]) # remove anything that might be in the way (see #1567) mrjob_bootstrap.append(['sudo rm -rf $__mrjob_PYTHON_LIB/mrjob']) # unzip mrjob.zip mrjob_bootstrap.append( ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB']) # re-compile pyc files now, since mappers/reducers can't # write to this directory. Don't fail if there is extra # un-compileable crud in the tarball (this would matter if # sh_bin were 'sh -e') mrjob_bootstrap.append( ['sudo %s -m compileall -q' ' -f $__mrjob_PYTHON_LIB/mrjob && true' % cmd_line(self._python_bin())]) path = os.path.join(self._get_local_tmp_dir(), 'b.sh') log.info('writing master bootstrap script to %s' % path) contents = self._master_bootstrap_script_content( self._bootstrap + mrjob_bootstrap) for line in contents: log.debug('BOOTSTRAP: ' + line) with open(path, 'wb') as f: for line in contents: f.write(line.encode('utf-8') + b'\n') self._master_bootstrap_script_path = path def _master_bootstrap_script_content(self, bootstrap): """Return a list containing the lines of the master bootstrap script. (without trailing newlines) """ out = [] # shebang, precommands out.extend(self._start_of_sh_script()) out.append('') # store $PWD out.append('# store $PWD') out.append('__mrjob_PWD=$PWD') out.append('') # special case for PWD being in /, which happens on Dataproc # (really we should cd to tmp or something) out.append('if [ $__mrjob_PWD = "/" ]; then') out.append(' __mrjob_PWD=""') out.append('fi') out.append('') # run commands in a block so we can redirect stdout to stderr # (e.g. to catch errors from compileall). See #370 out.append('{') # download files out.append(' # download files and mark them executable') cp_to_local = self._cp_to_local_cmd() # TODO: why bother with $__mrjob_PWD here, since we're already in it? 
for name, path in sorted( self._bootstrap_dir_mgr.name_to_path('file').items()): uri = self._upload_mgr.uri(path) out.append('') out.append(' %s %s $__mrjob_PWD/%s' % (cp_to_local, pipes.quote(uri), pipes.quote(name))) # imitate Hadoop Distributed Cache (see #1602) out.append(' chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name)) out.append('') # download and unarchive archives archive_names_and_paths = sorted( self._bootstrap_dir_mgr.name_to_path('archive').items()) if archive_names_and_paths: # make tmp dir if needed out.append(' # download and unpack archives') out.append(' __mrjob_TMP=$(mktemp -d)') out.append('') for name, path in archive_names_and_paths: uri = self._upload_mgr.uri(path) ext = file_ext(basename(path)) # copy file to tmp dir quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name) out.append(' %s %s %s' % ( cp_to_local, pipes.quote(uri), quoted_archive_path)) # unarchive file if ext not in _EXT_TO_UNARCHIVE_CMD: raise KeyError('unknown archive file extension: %s' % path) unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext] out.append(' ' + unarchive_cmd % dict( file=quoted_archive_path, dir='$__mrjob_PWD/' + pipes.quote(name))) # imitate Hadoop Distributed Cache (see #1602) out.append( ' chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name)) out.append('') # run bootstrap commands out.append(' # bootstrap commands') for cmd in bootstrap: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = ' ' for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._bootstrap_dir_mgr.name(**token)) else: # it's raw script line += token out.append(line) out.append('} 1>&2') # stdout -> stderr for ease of error log parsing return out def _start_of_sh_script(self): """Return a list of lines (without trailing newlines) containing the shell script shebang and pre-commands.""" out = [] # shebang sh_bin = self._sh_bin() if not sh_bin[0].startswith('/'): sh_bin = ['/usr/bin/env'] + sh_bin out.append('#!' + cmd_line(sh_bin)) # hook for 'set -e', etc. (see #1549) out.extend(self._sh_pre_commands()) return out ### Launching Clusters ### def _add_extra_cluster_params(self, params): """Return a dict with the *extra_cluster_params* opt patched into *params*, and ``None`` values removed.""" params = params.copy() params.update(self._opts['extra_cluster_params']) params = {k: v for k, v in params.items() if v is not None} return params
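# --- illustrative sketch, not part of mrjob ---
# In this version of _add_extra_cluster_params(), *extra_cluster_params*
# is applied as a flat dict update and keys whose value is None are then
# dropped, so users can both add and delete cluster API parameters. A
# standalone sketch of that behavior (function name and example parameter
# values are made up):

def _sketch_patch_params(params, extra):
    patched = dict(params)          # don't mutate the caller's dict
    patched.update(extra)
    return {k: v for k, v in patched.items() if v is not None}

# _sketch_patch_params({'ReleaseLabel': 'emr-5.8.0', 'LogUri': 's3://logs/'},
#                      {'LogUri': None, 'Tags': [{'Key': 'team'}]})
# -> {'ReleaseLabel': 'emr-5.8.0', 'Tags': [{'Key': 'team'}]}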
class MRJobRunner(object): """Abstract base class for all runners""" # this class handles the basic runner framework, options and config files, # arguments to mrjobs, and setting up job working dirs and environments. # this will put files from setup scripts, py_files, and bootstrap_mrjob # into the job's working dir, but won't actually run/import them # # command lines to run substeps (including Spark) are handled by # mrjob.bin.MRJobBinRunner #: alias for this runner; used for picking section of #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``, #: or ``'hadoop'`` alias = None # libjars is only here because the job can set it; might want to # handle this with a warning from the launcher instead OPT_NAMES = { 'bootstrap_mrjob', 'check_input_paths', 'cleanup', 'cleanup_on_failure', 'cmdenv', 'jobconf', 'label', 'libjars', 'local_tmp_dir', 'owner', 'py_files', 'setup', 'upload_archives', 'upload_dirs', 'upload_files' } # if this is true, when bootstrap_mrjob is true, add it through the # setup script _BOOTSTRAP_MRJOB_IN_SETUP = True ### methods to call from your batch script ### def __init__(self, mr_job_script=None, conf_paths=None, extra_args=None, file_upload_args=None, hadoop_input_format=None, hadoop_output_format=None, input_paths=None, output_dir=None, partitioner=None, sort_values=None, stdin=None, step_output_dir=None, **opts): """All runners take the following keyword arguments: :type mr_job_script: str :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work. :type conf_paths: None or list :param conf_paths: List of config files to combine and use, or None to search for mrjob.conf in the default locations. :type extra_args: list of str :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. This is a hook to allow jobs to take additional arguments. :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job. :type hadoop_input_format: str :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type hadoop_output_format: str :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type input_paths: list of str :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin :type output_dir: str :param output_dir: An empty/non-existent directory where Hadoop should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``. This option cannot be set from configuration files. 
If used with the hadoop runner, this path does not need to be fully qualified with ``hdfs://`` URIs because it's understood that it has to be on HDFS. :type partitioner: str :param partitioner: Optional name of a Hadoop partitioner class, e.g. ``'org.apache.hadoop.mapred.lib.HashPartitioner'``. Hadoop streaming will use this to determine how mapper output should be sorted and distributed to reducers. :type sort_values: bool :param sort_values: if true, set partitioners and jobconf variables so that reducers to receive the values associated with any key in sorted order (sorted by their *encoded* value). Also known as secondary sort. :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use as stdin. This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests. :type step_output_dir: str :param step_output_dir: An empty/non-existent directory where Hadoop should put output from all steps other than the last one (this only matters for multi-step jobs). Currently ignored by local runners. """ self._ran_job = False # opts are made from: # # empty defaults (everything set to None) # runner-specific defaults # opts from config file(s) # opts from command line self._opts = self._combine_confs( [(None, {key: None for key in self.OPT_NAMES})] + [(None, self._default_opts())] + load_opts_from_mrjob_confs(self.alias, conf_paths) + [('the command line', opts)]) log.debug('Active configuration:') log.debug( pprint.pformat({ opt_key: self._obfuscate_opt(opt_key, opt_value) for opt_key, opt_value in self._opts.items() })) self._fs = None # a local tmp directory that will be cleaned up when we're done # access/make this using self._get_local_tmp_dir() self._local_tmp_dir = None self._working_dir_mgr = WorkingDirManager() # mapping from dir to path for corresponding archive. we pick # paths during init(), but don't actually create the archives # until self._create_dir_archives() is called self._dir_to_archive_path = {} # dir archive names (the filename minus ".tar.gz") already taken self._dir_archive_names_taken = set() # set of dir_archives that have actually been created self._dir_archives_created = set() # track (name, path) of files and archives to upload to spark. 
# these are a subset of those in self._working_dir_mgr self._spark_files = [] self._spark_archives = [] self._upload_mgr = None # define in subclasses that use this self._script_path = mr_job_script if self._script_path: self._working_dir_mgr.add('file', self._script_path) # give this job a unique name self._job_key = self._make_unique_job_key(label=self._opts['label'], owner=self._opts['owner']) # extra args to our job self._extra_args = list(extra_args) if extra_args else [] # extra file arguments to our job self._file_upload_args = [] if file_upload_args: for arg, path in file_upload_args: arg_file = parse_legacy_hash_path('file', path) self._working_dir_mgr.add(**arg_file) self._file_upload_args.append((arg, arg_file)) self._spark_files.append((arg_file['name'], arg_file['path'])) # set up uploading for hash_path in self._opts['upload_files']: uf = parse_legacy_hash_path('file', hash_path, must_name='upload_files') self._working_dir_mgr.add(**uf) self._spark_files.append((uf['name'], uf['path'])) for hash_path in self._opts['upload_archives']: ua = parse_legacy_hash_path('archive', hash_path, must_name='upload_archives') self._working_dir_mgr.add(**ua) self._spark_archives.append((ua['name'], ua['path'])) for hash_path in self._opts['upload_dirs']: # pick name based on directory path ud = parse_legacy_hash_path('dir', hash_path, must_name='upload_archives') # but feed working_dir_mgr the archive's path archive_path = self._dir_archive_path(ud['path']) self._working_dir_mgr.add('archive', archive_path, name=ud['name']) self._spark_archives.append((ud['name'], archive_path)) # py_files # self._setup is a list of shell commands with path dicts # interleaved; see mrjob.setup.parse_setup_cmd() for details self._setup = self._parse_setup_and_py_files() for cmd in self._setup: for token in cmd: if isinstance(token, dict): # convert dir archives tokens to archives if token['type'] == 'dir': # feed the archive's path to self._working_dir_mgr token['path'] = self._dir_archive_path(token['path']) token['type'] = 'archive' self._working_dir_mgr.add(**token) # Where to read input from (log files, etc.) self._input_paths = input_paths or ['-'] # by default read from stdin if PY2: self._stdin = stdin or sys.stdin else: self._stdin = stdin or sys.stdin.buffer self._stdin_path = None # temp file containing dump from stdin # where a zip file of the mrjob library is stored locally self._mrjob_zip_path = None # store output_dir self._output_dir = output_dir # store partitioner self._partitioner = partitioner # store sort_values self._sort_values = sort_values # store step_output_dir self._step_output_dir = step_output_dir # store hadoop input and output formats self._hadoop_input_format = hadoop_input_format self._hadoop_output_format = hadoop_output_format # A cache for self._get_steps(); also useful as a test hook self._steps = None # this variable marks whether a cleanup has happened and this runner's # output stream is no longer available. self._closed = False ### Options #### def _default_opts(self): try: owner = getpass.getuser() except: owner = None return dict( check_input_paths=True, cleanup=['ALL'], cleanup_on_failure=['NONE'], local_tmp_dir=tempfile.gettempdir(), owner=owner, ) def _combine_confs(self, source_and_opt_list): """Combine several opt dictionaries into one. *source_and_opt_list* is a list of tuples of *source*, *opts* where *opts* is a dictionary and *source* is either None or a description of where the opts came from (usually a path). 
        Only override this if you need truly fine-grained control,
        including knowledge of the options' source.
        """
        opt_list = [
            self._fix_opts(opts, source)
            for source, opts in source_and_opt_list
        ]

        return self._combine_opts(opt_list)

    def _combine_opts(self, opt_list):
        """Combine several opt dictionaries into one.

        *opt_list* is a list of dictionaries containing validated options

        Override this if you need to base options off the values of
        other options, but don't need to issue warnings etc. about the
        options' source.
        """
        return combine_opts(self._opt_combiners(), *opt_list)

    def _opt_combiners(self):
        """A dictionary mapping opt name to combiner function. This
        won't necessarily include every opt name (we default to
        :py:func:`~mrjob.conf.combine_value`).
        """
        return _combiners(self.OPT_NAMES)

    def _fix_opts(self, opts, source=None):
        """Take an options dictionary, and either return a sanitized
        version of it, or raise an exception.

        *source* is either a string describing where the opts came from
        or None.

        This ensures that opt dictionaries are really dictionaries and
        handles deprecated options.
        """
        if source is None:
            source = 'defaults'  # defaults shouldn't trigger warnings

        if not isinstance(opts, dict):
            raise TypeError(
                'options for %s (from %s) must be a dict' % (
                    self.runner_alias, source))

        deprecated_aliases = _deprecated_aliases(self.OPT_NAMES)

        results = {}

        for k, v in sorted(opts.items()):
            # rewrite deprecated aliases
            if k in deprecated_aliases:
                if v is None:  # don't care
                    continue

                aliased_opt = deprecated_aliases[k]

                log.warning('Deprecated option %s (from %s) has been renamed'
                            ' to %s and will be removed in v0.7.0' % (
                                k, source, aliased_opt))

                if opts.get(aliased_opt) is not None:
                    continue  # don't overwrite non-aliased opt

                k = aliased_opt

            if k in self.OPT_NAMES:
                results[k] = None if v is None else self._fix_opt(k, v, source)
            else:
                log.warning('Unexpected option %s (from %s)' % (k, source))

        return results

    def _fix_opt(self, opt_key, opt_value, source):
        """Fix a single option, returning its correct value or raising
        an exception. This is not called for options that are ``None``.

        This currently handles cleanup opts.

        Override this if you require additional opt validation or cleanup.
        """
        if opt_key in ('cleanup', 'cleanup_on_failure'):
            return self._fix_cleanup_opt(opt_key, opt_value, source)
        else:
            return opt_value

    def _fix_cleanup_opt(self, opt_key, opt_value, source):
        """Fix a cleanup option, or raise ValueError."""
        if isinstance(opt_value, string_types):
            opt_value = [opt_value]

        if 'NONE' in opt_value and len(set(opt_value)) > 1:
            raise ValueError('Cannot clean up both nothing and something!'
                             ' (%s option from %s)' % (opt_key, source))

        for cleanup_type in opt_value:
            if cleanup_type not in CLEANUP_CHOICES:
                raise ValueError(
                    '%s must be one of %s, not %s (from %s)' % (
                        opt_key, ', '.join(CLEANUP_CHOICES), opt_value,
                        source))

        return opt_value

    def _obfuscate_opt(self, opt_key, opt_value):
        """Return value of opt to show in debug printout. Used to
        obfuscate credentials, etc."""
        return opt_value

    ### Filesystem object ###

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem.
        """
        if self._fs is None:
            # wrap LocalFilesystem in CompositeFilesystem to get IOError
            # on URIs (see #1185)
            self._fs = CompositeFilesystem(LocalFilesystem())

        return self._fs

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.
Raise :py:class:`~mrjob.step.StepFailedException` if there are any problems (except on :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the actual exception that caused the step to fail). """ if not self._script_path: raise AssertionError("No script to run!") if self._ran_job: raise AssertionError("Job already ran!") self._create_dir_archives() self._check_input_paths() self._run() self._ran_job = True def cat_output(self): """Stream the jobs output, as a stream of ``bytes``. If there are multiple output files, there will be an empty bytestring (``b''``) between them. .. versionadded:: 0.6.0 In previous versions, you'd use :py:meth:`stream_output`. """ output_dir = self.get_output_dir() if output_dir is None: raise AssertionError('Run the job before streaming output') if self._closed is True: log.warning( 'WARNING! Trying to stream output from a closed runner, output' ' will probably be empty.') log.info('Streaming final output from %s...' % output_dir) def split_path(path): while True: base, name = os.path.split(path) # no more elements if not name: break yield name path = base def ls_output(): for filename in self.fs.ls(output_dir): subpath = filename[len(output_dir):] if not (any( name.startswith('_') for name in split_path(subpath))): yield filename for i, filename in enumerate(ls_output()): if i > 0: yield b'' # EOF of previous file for chunk in self.fs._cat_file(filename): yield chunk def stream_output(self): """Like :py:meth:`cat_output` except that it groups bytes into lines. Equivalent to ``mrjob.util.to_lines(runner.stream_output())``. .. deprecated:: 0.6.0 """ log.warning('stream_output() is deprecated and will be removed in' ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())' ' instead.') return to_lines(self.cat_output()) def _cleanup_mode(self, mode=None): """Actual cleanup action to take based on various options""" if self._script_path and not self._ran_job: return mode or self._opts['cleanup_on_failure'] else: return mode or self._opts['cleanup'] def _cleanup_cloud_tmp(self): """Cleanup any files/directories on cloud storage (e.g. S3) we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only EMR runner does this def _cleanup_hadoop_tmp(self): """Cleanup any files/directories on HDFS we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only Hadoop runner does this def _cleanup_local_tmp(self): """Cleanup any files/directories on the local machine we created while running this job. Should be safe to run this at any time, or multiple times. This particular function removes any local tmp directories added to the list self._local_tmp_dirs This won't remove output_dir if it's outside of our tmp dir. """ if self._local_tmp_dir: log.info('Removing temp directory %s...' % self._local_tmp_dir) try: shutil.rmtree(self._local_tmp_dir) except OSError as e: log.exception(e) self._local_tmp_dir = None def _cleanup_cluster(self): """Terminate the cluster if there is one.""" pass # this only happens on EMR def _cleanup_logs(self): """Cleanup any log files that are created as a side-effect of the job. """ pass # this only happens on EMR def _cleanup_job(self): """Stop any jobs that we created that are still running.""" pass # currently disabled (see #1241) def cleanup(self, mode=None): """Clean up running jobs, temp files, and logs, subject to the *cleanup* option passed to the constructor. 
If you create your runner in a :keyword:`with` block, :py:meth:`cleanup` will be called automatically:: with mr_job.make_runner() as runner: ... # cleanup() called automatically here :param mode: override *cleanup* passed into the constructor. Should be a list of strings from :py:data:`CLEANUP_CHOICES` """ mode = self._cleanup_mode(mode) def mode_has(*args): return any((choice in mode) for choice in args) if self._script_path and not self._ran_job: if mode_has('CLUSTER', 'ALL'): self._cleanup_cluster() if mode_has('JOB', 'ALL'): self._cleanup_job() if mode_has('ALL', 'TMP', 'CLOUD_TMP'): self._cleanup_cloud_tmp() if mode_has('ALL', 'TMP', 'HADOOP_TMP'): self._cleanup_hadoop_tmp() if mode_has('ALL', 'TMP', 'LOCAL_TMP'): self._cleanup_local_tmp() if mode_has('ALL', 'LOGS'): self._cleanup_logs() self._closed = True def counters(self): """Get counters associated with this run in this form:: [{'group name': {'counter1': 1, 'counter2': 2}}, {'group name': ...}] The list contains an entry for every step of the current job. """ raise NotImplementedError ### hooks for the with statement ### def __enter__(self): """Don't do anything special at start of with block""" return self def __exit__(self, type, value, traceback): """Call self.cleanup() at end of with block.""" self.cleanup() ### more runner information ### def get_opts(self): """Get options set for this runner, as a dict.""" return copy.deepcopy(self._opts) def get_job_key(self): """Get the unique key for the job run by this runner. This has the format ``label.owner.date.time.microseconds`` """ return self._job_key def get_output_dir(self): """Find the directory containing the job output. If the job hasn't run yet, returns None""" if self._script_path and not self._ran_job: return None return self._output_dir ### other methods you need to implement in your subclass ### def get_hadoop_version(self): """Return the version number of the Hadoop environment as a string if Hadoop is being used or simulated. Return None if not applicable. :py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster. :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an additional `hadoop_version` option to specify which version it simulates. :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at all. """ return None # you'll probably wan't to add your own __init__() and cleanup() as well def _run(self): """Run the job.""" raise NotImplementedError ### internal utilities for implementing MRJobRunners ### def _get_local_tmp_dir(self): """Create a tmp directory on the local filesystem that will be cleaned up by self.cleanup()""" if not self._local_tmp_dir: path = os.path.join(self._opts['local_tmp_dir'], self._job_key) log.info('Creating temp directory %s' % path) if os.path.isdir(path): shutil.rmtree(path) os.makedirs(path) self._local_tmp_dir = path return self._local_tmp_dir def _make_unique_job_key(self, label=None, owner=None): """Come up with a useful unique ID for this job. We use this to choose the output directory, etc. for the job. 
""" # use the name of the script if one wasn't explicitly # specified if not label: if self._script_path: label = os.path.basename(self._script_path).split('.')[0] else: label = 'no_script' if not owner: owner = 'no_user' now = datetime.datetime.utcnow() return '%s.%s.%s.%06d' % (label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond) def _get_steps(self): """Call the job script to find out how many steps it has, and whether there are mappers and reducers for each step. Validate its output. Returns output as described in :ref:`steps-format`. Results are cached, so call this as many times as you want. """ if self._steps is None: self._steps = self._load_steps() return self._steps def _load_steps(self): """Ask job how many steps it has, and whether there are mappers and reducers for each step. Returns output as described in :ref:`steps-format`. """ raise NotImplementedError def _get_step(self, step_num): """Get a single step (calls :py:meth:`_get_steps`).""" return self._get_steps()[step_num] def _num_steps(self): """Get the number of steps (calls :py:meth:`get_steps`).""" return len(self._get_steps()) def _has_streaming_steps(self): """Are any of our steps Hadoop streaming steps?""" return any(step['type'] == 'streaming' for step in self._get_steps()) def _has_spark_steps(self): """Are any of our steps Spark steps (either spark or spark_script)""" return any( _is_spark_step_type(step['type']) for step in self._get_steps()) def _args_for_task(self, step_num, mrc): return [ '--step-num=%d' % step_num, '--%s' % mrc, ] + self._mr_job_extra_args() def _mr_job_extra_args(self, local=False): """Return arguments to add to every invocation of MRJob. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ return (self._get_file_upload_args(local=local) + self._extra_args) def _get_file_upload_args(self, local=False): """Arguments used to pass through config files, etc from the job runner through to the local directory where the script is run. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ args = [] for arg, path_dict in self._file_upload_args: args.append(arg) if local: args.append(path_dict['path']) else: args.append(self._working_dir_mgr.name(**path_dict)) return args def _dir_archive_path(self, dir_path): """Assign a path for the archive of *dir_path* but don't actually create anything.""" if dir_path not in self._dir_to_archive_path: # we can check local paths now if not (is_uri(dir_path) or os.path.isdir(dir_path)): raise OSError('%s is not a directory!' 
% dir_path) name = name_uniquely(dir_path, names_taken=self._dir_archive_names_taken) self._dir_archive_names_taken.add(name) self._dir_to_archive_path[dir_path] = os.path.join( self._get_local_tmp_dir(), 'archives', name + '.tar.gz') return self._dir_to_archive_path[dir_path] def _create_dir_archives(self): """Call this to create all dir archives""" for dir_path in sorted(set(self._dir_to_archive_path)): self._create_dir_archive(dir_path) def _create_dir_archive(self, dir_path): """Helper for :py:meth:`archive_dir`""" if not self.fs.exists(dir_path): raise OSError('%s does not exist') tar_gz_path = self._dir_archive_path(dir_path) if tar_gz_path in self._dir_archives_created: return # already created if not os.path.isdir(os.path.dirname(tar_gz_path)): os.makedirs(os.path.dirname(tar_gz_path)) # for remote files tmp_download_path = os.path.join(self._get_local_tmp_dir(), 'tmp-download') log.info('Archiving %s -> %s' % (dir_path, tar_gz_path)) with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz: for path in self.fs.ls(dir_path): # fs.ls() only lists files if path == dir_path: raise OSError('%s is a file, not a directory!' % dir_path) # TODO: do we need this? if os.path.realpath(path) == os.path.realpath(tar_gz_path): raise OSError('attempted to archive %s into itself!' % tar_gz_path) if is_uri(path): path_in_tar_gz = path[len(dir_path):].lstrip('/') log.info(' downloading %s -> %s' % (path, tmp_download_path)) with open(tmp_download_path, 'wb') as f: for chunk in self.fs.cat(path): f.write(chunk) local_path = tmp_download_path else: path_in_tar_gz = path[len(dir_path):].lstrip(os.sep) local_path = path log.debug(' adding %s to %s' % (path, tar_gz_path)) tar_gz.add(local_path, path_in_tar_gz, recursive=False) self._dir_archives_created.add(tar_gz_path) def _bootstrap_mrjob(self): """Should we bootstrap mrjob?""" if self._opts['bootstrap_mrjob'] is None: return self._opts['interpreter'] is None else: return bool(self._opts['bootstrap_mrjob']) def _get_input_paths(self): """Get the paths to input files, dumping STDIN to a local file if need be.""" if '-' in self._input_paths: if self._stdin_path is None: # prompt user, so they don't think the process has stalled log.info('reading from STDIN') stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') log.debug('dumping stdin to local file %s' % stdin_path) with open(stdin_path, 'wb') as stdin_file: for line in self._stdin: # catch missing newlines (often happens with test data) if not line.endswith(b'\n'): line += b'\n' stdin_file.write(line) self._stdin_path = stdin_path return [self._stdin_path if p == '-' else p for p in self._input_paths] def _check_input_paths(self): """Check that input exists prior to running the job, if the `check_input_paths` option is true.""" if not self._opts['check_input_paths']: return for path in self._input_paths: if path == '-': continue # STDIN always exists if not self.fs.can_handle_path(path): continue # e.g. non-S3 URIs on EMR if not self.fs.exists(path): raise IOError('Input path %s does not exist!' % (path, )) def _intermediate_output_uri(self, step_num, local=False): """A URI for intermediate output for the given step number.""" join = os.path.join if local else posixpath.join return join(self._step_output_dir or self._default_step_output_dir(), '%04d' % step_num) def _default_step_output_dir(self): """Where to put output for steps other than the last one, if not specified by the *output_dir* constructor keyword. Usually you want this to be on HDFS (most efficient). 
Define this in your runner subclass. """ raise NotImplementedError def _step_input_uris(self, step_num): """A list of URIs to use as input for the given step. For all except the first step, this list will have a single item (a directory).""" if step_num == 0: return [ self._upload_mgr.uri(path) for path in self._get_input_paths() ] else: return [self._intermediate_output_uri(step_num - 1)] def _step_output_uri(self, step_num): """URI to use as output for the given step. This is either an intermediate dir (see :py:meth:`intermediate_output_uri`) or ``self._output_dir`` for the final step.""" if step_num == len(self._get_steps()) - 1: return self._output_dir else: return self._intermediate_output_uri(step_num) def _interpolate_input_and_output(self, args, step_num): """Replace :py:data:`~mrjob.step.INPUT` and :py:data:`~mrjob.step.OUTPUT` in arguments to a jar or Spark step. If there are multiple input paths (i.e. on the first step), they'll be joined with a comma. """ def interpolate(arg): if arg == mrjob.step.INPUT: return ','.join(self._step_input_uris(step_num)) elif arg == mrjob.step.OUTPUT: return self._step_output_uri(step_num) else: return arg return [interpolate(arg) for arg in args] def _create_mrjob_zip(self): """Make a zip of the mrjob library, without .pyc or .pyo files, This will also set ``self._mrjob_zip_path`` and return it. Typically called from :py:meth:`_create_setup_wrapper_script`. It's safe to call this method multiple times (we'll only create the zip file once.) """ if not self._mrjob_zip_path: # find mrjob library import mrjob if not os.path.basename(mrjob.__file__).startswith('__init__.'): raise Exception( "Bad path for mrjob library: %s; can't bootstrap mrjob", mrjob.__file__) mrjob_dir = os.path.dirname(mrjob.__file__) or '.' zip_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.zip') def filter_path(path): filename = os.path.basename(path) return not (filename.lower().endswith('.pyc') or filename.lower().endswith('.pyo') or # filter out emacs backup files filename.endswith('~') or # filter out emacs lock files filename.startswith('.#') or # filter out MacFuse resource forks filename.startswith('._')) log.debug('archiving %s -> %s as %s' % (mrjob_dir, zip_path, os.path.join('mrjob', ''))) zip_dir(mrjob_dir, zip_path, filter=filter_path, prefix='mrjob') self._mrjob_zip_path = zip_path return self._mrjob_zip_path def _jobconf_for_step(self, step_num): """Get the jobconf dictionary, optionally including step-specific jobconf info. Also translate jobconfs to the current Hadoop version, if necessary. """ step = self._get_step(step_num) # _sort_values_jobconf() isn't relevant to Spark, # but it doesn't do any harm either jobconf = combine_dicts(self._sort_values_jobconf(), self._opts['jobconf'], step.get('jobconf')) # if user is using the wrong jobconfs, add in the correct ones # and log a warning hadoop_version = self.get_hadoop_version() if hadoop_version: jobconf = translate_jobconf_dict(jobconf, hadoop_version) return jobconf def _sort_values_jobconf(self): """Jobconf dictionary to enable sorting by value. 
""" if not self._sort_values: return {} # translate _SORT_VALUES_JOBCONF to the correct Hadoop version, # without logging a warning hadoop_version = self.get_hadoop_version() jobconf = {} for k, v in _SORT_VALUES_JOBCONF.items(): if hadoop_version: jobconf[translate_jobconf(k, hadoop_version)] = v else: for j in translate_jobconf_for_all_versions(k): jobconf[j] = v return jobconf def _sort_values_partitioner(self): """Partitioner to use with *sort_values* keyword to the constructor.""" if self._sort_values: return _SORT_VALUES_PARTITIONER else: return None def _parse_setup_and_py_files(self): """Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`, and patch in *py_files*. """ setup = [] # py_files for path in self._opts['py_files']: # Spark (at least v1.3.1) doesn't work with # and --py-files, # see #1375 if '#' in path: raise ValueError("py_files cannot contain '#'") path_dict = parse_legacy_hash_path('file', path) setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH']) # setup for cmd in self._opts['setup']: setup.append(parse_setup_cmd(cmd)) return setup def _upload_args(self): # just upload every file and archive in the working dir manager return self._upload_args_helper('-files', None, '-archives', None) def _upload_args_helper(self, files_opt_str, files, archives_opt_str, archives): args = [] file_hash_paths = list(self._arg_hash_paths('file', files)) if file_hash_paths: args.append(files_opt_str) args.append(','.join(file_hash_paths)) archive_hash_paths = list(self._arg_hash_paths('archive', archives)) if archive_hash_paths: args.append(archives_opt_str) args.append(','.join(archive_hash_paths)) return args def _arg_hash_paths(self, type, named_paths=None): """Helper function for the *upload_args methods.""" if named_paths is None: # just return everything managed by _working_dir_mgr named_paths = sorted( self._working_dir_mgr.name_to_path(type).items()) for name, path in named_paths: if not name: name = self._working_dir_mgr.name(type, path) uri = self._upload_mgr.uri(path) yield '%s#%s' % (uri, name)
class HadoopInTheCloudJobRunner(MRJobBinRunner): """Abstract base class for all Hadoop-in-the-cloud services.""" alias = '_cloud' OPT_NAMES = MRJobBinRunner.OPT_NAMES | { 'bootstrap', 'bootstrap_python', 'check_cluster_every', 'cloud_fs_sync_secs', 'cloud_part_size_mb', 'cloud_tmp_dir', 'cluster_id', 'core_instance_type', 'extra_cluster_params', 'hadoop_streaming_jar', 'image_id', 'image_version', 'instance_type', 'master_instance_type', 'max_mins_idle', 'num_core_instances', 'num_task_instances', 'region', 'ssh_bind_ports', 'ssh_tunnel', 'ssh_tunnel_is_open', 'task_instance_type', 'zone', } # so far, every service provides the ability to run bootstrap scripts _BOOTSTRAP_MRJOB_IN_PY_FILES = False def __init__(self, **kwargs): super(HadoopInTheCloudJobRunner, self).__init__(**kwargs) # if *cluster_id* is not set, ``self._cluster_id`` will be # set when we create or join a cluster self._cluster_id = self._opts['cluster_id'] # bootstrapping self._bootstrap = self._bootstrap_python() + self._parse_bootstrap() # add files to manager self._bootstrap_dir_mgr = WorkingDirManager() for cmd in self._bootstrap: for token in cmd: if isinstance(token, dict): # convert dir archive tokens to archives if token['type'] == 'dir': token['path'] = self._dir_archive_path(token['path']) token['type'] = 'archive' self._bootstrap_dir_mgr.add(**token) # we'll create this script later, as needed self._master_bootstrap_script_path = None # ssh state # the process for the SSH tunnel self._ssh_proc = None # if this is true, stop trying to launch the SSH tunnel self._give_up_on_ssh_tunnel = False # store the (tunneled) URL of the job tracker/resource manager self._ssh_tunnel_url = None ### Options ### @classmethod def _default_opts(cls): return combine_dicts( super(HadoopInTheCloudJobRunner, cls)._default_opts(), dict( cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB, max_mins_idle=_DEFAULT_MAX_MINS_IDLE, # don't use a list because it makes it hard to read option # values when running in verbose mode. See #1284 ssh_bind_ports=xrange(40001, 40841), ssh_tunnel=False, ssh_tunnel_is_open=False, # ssh_bin isn't included here. For example, the Dataproc # runner launches ssh through the gcloud util ), ) def _fix_opts(self, opts, source=None): opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(opts, source=source) # cloud_part_size_mb should be a number if opts.get('cloud_part_size_mb') is not None: if not isinstance(opts['cloud_part_size_mb'], (integer_types, float)): raise TypeError('cloud_part_size_mb must be a number') return opts def _combine_opts(self, opt_list): """Propagate *instance_type* to other instance type opts, if not already set. Also propagate core instance type to task instance type, if it's not already set. 
""" opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list) if opts['instance_type']: # figure out how late in the configs opt was set (setting # --instance_type on the command line overrides core_instance_type # set in configs) opt_priority = {k: -1 for k in opts} for i, sub_opts in enumerate(opt_list): for k, v in sub_opts.items(): if v == opts[k]: opt_priority[k] = i # instance_type only affects master_instance_type if there are # no other instances if opts['num_core_instances'] or opts['num_task_instances']: propagate_to = ['core_instance_type', 'task_instance_type'] else: propagate_to = ['master_instance_type'] for k in propagate_to: if opts[k] is None or (opt_priority[k] < opt_priority['instance_type']): opts[k] = opts['instance_type'] if not opts['task_instance_type']: opts['task_instance_type'] = opts['core_instance_type'] return opts ### Bootstrapping ### def _bootstrap_python(self): """Redefine this to return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())' to make sure a compatible version of Python is installed If the *bootstrap_python* option is false, should always return ``[]``. """ return [] def _cp_to_local_cmd(self): """Command to copy files from the cloud to the local directory (usually via Hadoop). Redefine this as needed; for example, on EMR, we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't installed at bootstrap time.""" return 'hadoop fs -copyToLocal' def _parse_bootstrap(self): """Parse the *bootstrap* option with :py:func:`mrjob.setup.parse_setup_cmd()`. """ return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']] def _create_master_bootstrap_script_if_needed(self): """Helper for :py:meth:`_add_bootstrap_files_for_upload`. Create the master bootstrap script and write it into our local temp directory. Set self._master_bootstrap_script_path. This will do nothing if there are no bootstrap scripts or commands, or if it has already been called.""" if self._master_bootstrap_script_path: return # don't bother if we're not starting a cluster if self._cluster_id: return # Also don't bother if we're not bootstrapping if not (self._bootstrap or self._bootstrap_mrjob()): return # create mrjob.zip if we need it, and add commands to install it mrjob_bootstrap = [] if self._bootstrap_mrjob(): assert self._mrjob_zip_path path_dict = { 'type': 'file', 'name': None, 'path': self._mrjob_zip_path } self._bootstrap_dir_mgr.add(**path_dict) # find out where python keeps its libraries mrjob_bootstrap.append([ "__mrjob_PYTHON_LIB=$(%s -c " "'from distutils.sysconfig import get_python_lib;" " print(get_python_lib())')" % cmd_line(self._python_bin()) ]) # remove anything that might be in the way (see #1567) mrjob_bootstrap.append(['sudo rm -rf $__mrjob_PYTHON_LIB/mrjob']) # unzip mrjob.zip mrjob_bootstrap.append( ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB']) # re-compile pyc files now, since mappers/reducers can't # write to this directory. 
Don't fail if there is extra # un-compileable crud in the tarball (this would matter if # sh_bin were 'sh -e') mrjob_bootstrap.append([ 'sudo %s -m compileall -q' ' -f $__mrjob_PYTHON_LIB/mrjob && true' % cmd_line(self._python_bin()) ]) path = os.path.join(self._get_local_tmp_dir(), 'b.sh') log.info('writing master bootstrap script to %s' % path) contents = self._master_bootstrap_script_content(self._bootstrap + mrjob_bootstrap) self._write_script(contents, path, 'master bootstrap script') self._master_bootstrap_script_path = path def _master_bootstrap_script_content(self, bootstrap): """Return a list containing the lines of the master bootstrap script. (without trailing newlines) """ out = [] # shebang, precommands out.extend(self._start_of_sh_script()) out.append('') # for example, create a tmp dir and cd to it if self._bootstrap_pre_commands(): out.extend(self._bootstrap_pre_commands()) out.append('') # store $PWD out.append('# store $PWD') out.append('__mrjob_PWD=$PWD') out.append('') # special case for PWD being in /, which happens on Dataproc # (really we should cd to tmp or something) out.append('if [ $__mrjob_PWD = "/" ]; then') out.append(' __mrjob_PWD=""') out.append('fi') out.append('') # run commands in a block so we can redirect stdout to stderr # (e.g. to catch errors from compileall). See #370 out.append('{') # download files out.append(' # download files and mark them executable') cp_to_local = self._cp_to_local_cmd() # TODO: why bother with $__mrjob_PWD here, since we're already in it? for name, path in sorted( self._bootstrap_dir_mgr.name_to_path('file').items()): uri = self._upload_mgr.uri(path) out.append(' %s %s $__mrjob_PWD/%s' % (cp_to_local, pipes.quote(uri), pipes.quote(name))) # imitate Hadoop Distributed Cache (see #1602) out.append(' chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name)) out.append('') # download and unarchive archives archive_names_and_paths = sorted( self._bootstrap_dir_mgr.name_to_path('archive').items()) if archive_names_and_paths: # make tmp dir if needed out.append(' # download and unpack archives') out.append(' __mrjob_TMP=$(mktemp -d)') out.append('') for name, path in archive_names_and_paths: uri = self._upload_mgr.uri(path) ext = file_ext(basename(path)) archive_file_name = self._bootstrap_dir_mgr.name( 'archive_file', path) # copy file to tmp dir quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote( archive_file_name) out.append( ' %s %s %s' % (cp_to_local, pipes.quote(uri), quoted_archive_path)) out.append(' ' + _unarchive_cmd(path) % dict(file=quoted_archive_path, dir='$__mrjob_PWD/' + pipes.quote(name))) # imitate Hadoop Distributed Cache (see #1602) out.append(' chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name)) out.append('') # run bootstrap commands out.append(' # bootstrap commands') for cmd in bootstrap: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = ' ' for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._bootstrap_dir_mgr.name(**token)) else: # it's raw script line += token out.append(line) out.append('} 1>&2') # stdout -> stderr for ease of error log parsing return out def _bootstrap_pre_commands(self): """A list of hard-coded commands to run at the beginning of the bootstrap script. 
Currently used by dataproc to cd into a tmp dir.""" return [] def _start_of_sh_script(self): """Return a list of lines (without trailing newlines) containing the shell script shebang and pre-commands.""" out = [] # shebang sh_bin = self._sh_bin() if not sh_bin[0].startswith('/'): sh_bin = ['/usr/bin/env'] + sh_bin out.append('#!' + cmd_line(sh_bin)) # hook for 'set -e', etc. (see #1549) out.extend(self._sh_pre_commands()) return out ### Launching Clusters ### def _add_extra_cluster_params(self, params): """Return a dict with the *extra_cluster_params* opt patched into *params*, and ``None`` values removed.""" params = deepcopy(params) for k, v in sorted(self._opts['extra_cluster_params'].items()): _patch_params(params, k, v) return params ### SSH Tunnel ### def _ssh_tunnel_args(self, bind_port): """Redefine this in your subclass. You will probably want to call :py:meth:`_ssh_tunnel_opts` somewhere in here. Should return the list of args used to run the command to open the SSH tunnel, bound to *bind_port* on your computer, or ``None`` if it isn't possible to set up an SSH tunnel. """ return None def _ssh_tunnel_config(self): """Redefine this in your subclass. Should return a dict with the following keys: *localhost*: once we SSH in, is the web interface? reachable at ``localhost`` *name*: either ``'job tracker'`` or ``'resource manager'`` *path*: path of main page on web interface (e.g. "/cluster") *port*: port number of the web interface """ raise NotImplementedError def _launch_ssh_proc(self, args): """The command used to create a :py:class:`subprocess.Popen` to run the SSH tunnel. You usually don't need to redefine this.""" log.debug('> %s' % cmd_line(args)) return Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE) def _ssh_launch_wait_secs(self): """Wait this long after launching the SSH process before checking for failure (default 1 second). You may redefine this.""" return 1.0 def _set_up_ssh_tunnel(self): """Call this whenever you think it is possible to SSH to your cluster. This sets :py:attr:`_ssh_proc`. Does nothing if :mrjob-opt:`ssh_tunnel` is not set, or there is already a tunnel process running. """ # did the user request an SSH tunnel? if not self._opts['ssh_tunnel']: return # no point in trying to launch a nonexistent command twice if self._give_up_on_ssh_tunnel: return # did we already launch the SSH tunnel process? is it still running? if self._ssh_proc: self._ssh_proc.poll() if self._ssh_proc.returncode is None: return else: log.warning(' Oops, ssh subprocess exited with return code' ' %d, restarting...' % self._ssh_proc.returncode) self._ssh_proc = None tunnel_config = self._ssh_tunnel_config() bind_port = None popen_exception = None ssh_tunnel_args = [] for bind_port in self._pick_ssh_bind_ports(): ssh_proc = None ssh_tunnel_args = self._ssh_tunnel_args(bind_port) # can't launch SSH tunnel right now if not ssh_tunnel_args: return try: ssh_proc = self._launch_ssh_proc(ssh_tunnel_args) except OSError as ex: # e.g. OSError(2, 'File not found') popen_exception = ex # warning handled below break if ssh_proc: time.sleep(self._ssh_launch_wait_secs()) ssh_proc.poll() # still running. 
We are golden if ssh_proc.returncode is None: self._ssh_proc = ssh_proc break else: ssh_proc.stdin.close() ssh_proc.stdout.close() ssh_proc.stderr.close() if self._ssh_proc: if self._opts['ssh_tunnel_is_open']: bind_host = socket.getfqdn() else: bind_host = 'localhost' self._ssh_tunnel_url = 'http://%s:%d%s' % (bind_host, bind_port, tunnel_config['path']) log.info(' Connect to %s at: %s' % (tunnel_config['name'], self._ssh_tunnel_url)) else: if popen_exception: # this only happens if the ssh binary is not present # or not executable (so tunnel_config and the args to the # ssh binary don't matter) log.warning(" Couldn't open SSH tunnel: %s" % popen_exception) self._give_up_on_ssh_tunnel = True return else: log.warning(' Failed to open ssh tunnel to %s' % tunnel_config['name']) def _kill_ssh_tunnel(self): """Send SIGKILL to SSH tunnel, if it's running.""" if not self._ssh_proc: return self._ssh_proc.poll() if self._ssh_proc.returncode is None: log.info('Killing our SSH tunnel (pid %d)' % self._ssh_proc.pid) self._ssh_proc.stdin.close() self._ssh_proc.stdout.close() self._ssh_proc.stderr.close() try: if hasattr(signal, 'SIGKILL'): os.kill(self._ssh_proc.pid, signal.SIGKILL) else: # Windows doesn't have SIGKILL, see #1892 os.kill(self._ssh_proc.pid, signal.SIGABRT) except Exception as e: log.exception(e) self._ssh_proc = None self._ssh_tunnel_url = None def _ssh_tunnel_opts(self, bind_port): """Options to SSH related to setting up a tunnel (rather than SSHing in). Helper for :py:meth:`_ssh_tunnel_args`. """ args = self._ssh_local_tunnel_opt(bind_port) + [ '-N', '-n', '-q', ] if self._opts['ssh_tunnel_is_open']: args.extend(['-g', '-4']) # -4: listen on IPv4 only return args def _ssh_local_tunnel_opt(self, bind_port): """Helper for :py:meth:`_ssh_tunnel_opts`.""" tunnel_config = self._ssh_tunnel_config() return [ '-L', '%d:%s:%d' % ( bind_port, self._job_tracker_host(), tunnel_config['port'], ), ] def _pick_ssh_bind_ports(self): """Pick a list of ports to try binding our SSH tunnel to. We will try to bind the same port for any given cluster (Issue #67) """ # don't perturb the random number generator random_state = random.getstate() try: # seed random port selection on cluster ID random.seed(self._cluster_id) num_picks = min(_MAX_SSH_RETRIES, len(self._opts['ssh_bind_ports'])) return random.sample(self._opts['ssh_bind_ports'], num_picks) finally: random.setstate(random_state)
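# Illustrative sketch (not part of mrjob): the deterministic port-picking
# trick used by _pick_ssh_bind_ports() above. Seeding the RNG on the
# cluster ID means the same cluster always tries the same bind ports (so a
# later run can find an already-open tunnel), while saving and restoring
# the RNG state leaves the rest of the program's randomness unperturbed.
# The port range and pick count below are assumptions for the sketch.
import random


def pick_ports(cluster_id, candidate_ports=range(40001, 40841), num_picks=20):
    saved_state = random.getstate()
    try:
        random.seed(cluster_id)
        num_picks = min(num_picks, len(candidate_ports))
        return random.sample(candidate_ports, num_picks)
    finally:
        # put the global RNG back the way we found it
        random.setstate(saved_state)


if __name__ == '__main__':
    # same cluster ID -> same sequence of ports to try, on every run
    assert pick_ports('j-ABC123') == pick_ports('j-ABC123')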
class MRJobRunner(object): """Abstract base class for all runners""" #: alias for this runner; used for picking section of #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``, #: or ``'hadoop'`` alias = None # if this is true, when bootstrap_mrjob is true, add it through the # setup script BOOTSTRAP_MRJOB_IN_SETUP = True OPTION_STORE_CLASS = RunnerOptionStore ### methods to call from your batch script ### def __init__(self, mr_job_script=None, conf_path=None, extra_args=None, file_upload_args=None, hadoop_input_format=None, hadoop_output_format=None, input_paths=None, output_dir=None, partitioner=None, stdin=None, conf_paths=None, **opts): """All runners take the following keyword arguments: :type mr_job_script: str :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work. :type conf_path: str, None, or False :param conf_path: Deprecated. Alternate path to read configs from, or ``False`` to ignore all config files. Use *conf_paths* instead. :type conf_paths: None or list :param conf_paths: List of config files to combine and use, or None to search for mrjob.conf in the default locations. :type extra_args: list of str :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. This is a hook to allow jobs to take additional arguments. :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job. :type hadoop_input_format: str :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see *hadoop_streaming_jar*). :type hadoop_output_format: str :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see *hadoop_streaming_jar*). :type input_paths: list of str :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin :type output_dir: str :param output_dir: An empty/non-existent directory where Hadoop streaming should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``. This option cannot be set from configuration files. If used with the hadoop runner, this path does not need to be fully qualified with ``hdfs://`` URIs because it's understood that it has to be on HDFS. :type partitioner: str :param partitioner: Optional name of a Hadoop partitoner class, e.g. ``'org.apache.hadoop.mapred.lib.HashPartitioner'``. Hadoop streaming will use this to determine how mapper output should be sorted and distributed to reducers. :param stdin: an iterable (can be a ``StringIO`` or even a list) to use as stdin. 
This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests. """ self._ran_job = False if conf_path is not None: if conf_paths is not None: raise ValueError("Can't specify both conf_path and conf_paths") else: log.warn("The conf_path argument to MRJobRunner() is" " deprecated. Use conf_paths instead.") if conf_path is False: conf_paths = [] else: conf_paths = [conf_path] self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths) self._fs = None self._working_dir_mgr = WorkingDirManager() self._script_path = mr_job_script if self._script_path: self._working_dir_mgr.add('file', self._script_path) # give this job a unique name self._job_name = self._make_unique_job_name(label=self._opts['label'], owner=self._opts['owner']) # we'll create the wrapper script later self._setup_wrapper_script_path = None # extra args to our job self._extra_args = list(extra_args) if extra_args else [] # extra file arguments to our job self._file_upload_args = [] if file_upload_args: for arg, path in file_upload_args: arg_file = parse_legacy_hash_path('file', path) self._working_dir_mgr.add(**arg_file) self._file_upload_args.append((arg, arg_file)) # set up uploading for path in self._opts['upload_files']: self._working_dir_mgr.add(**parse_legacy_hash_path( 'file', path, must_name='upload_files')) for path in self._opts['upload_archives']: self._working_dir_mgr.add(**parse_legacy_hash_path( 'archive', path, must_name='upload_archives')) # python_archives, setup, setup_cmds, and setup_scripts # self._setup is a list of shell commands with path dicts # interleaved; see mrjob.setup.parse_setup_cmds() for details self._setup = self._parse_setup() for cmd in self._setup: for maybe_path_dict in cmd: if isinstance(maybe_path_dict, dict): self._working_dir_mgr.add(**maybe_path_dict) # Where to read input from (log files, etc.) self._input_paths = input_paths or ['-'] # by default read from stdin self._stdin = stdin or sys.stdin self._stdin_path = None # temp file containing dump from stdin # where a tarball of the mrjob library is stored locally self._mrjob_tar_gz_path = None # store output_dir self._output_dir = output_dir # store partitioner self._partitioner = partitioner # store hadoop input and output formats self._hadoop_input_format = hadoop_input_format self._hadoop_output_format = hadoop_output_format # a local tmp directory that will be cleaned up when we're done # access/make this using self._get_local_tmp_dir() self._local_tmp_dir = None # A cache for self._get_steps(); also useful as a test hook self._steps = None # if this is True, we have to pipe input into the sort command # rather than feed it multiple files self._sort_is_windows_sort = None # this variable marks whether a cleanup has happened and this runner's # output stream is no longer available. self._closed = False ### Filesystem object ### @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for the local filesystem. 
Methods on :py:class:`~mrjob.fs.base.Filesystem` objects will be forwarded to :py:class:`~mrjob.runner.MRJobRunner` until mrjob 0.5, but **this behavior is deprecated.** """ if self._fs is None: self._fs = LocalFilesystem() return self._fs def __getattr__(self, name): # For backward compatibility, forward filesystem methods try: return getattr(self.fs, name) except AttributeError: raise AttributeError(name) ### Running the job and parsing output ### def run(self): """Run the job, and block until it finishes. Raise an exception if there are any problems. """ if not self._script_path: raise AssertionError("No script to run!") if self._ran_job: raise AssertionError("Job already ran!") self._run() self._ran_job = True def stream_output(self): """Stream raw lines from the job's output. You can parse these using the read() method of the appropriate HadoopStreamingProtocol class.""" output_dir = self.get_output_dir() if output_dir is None: raise AssertionError('Run the job before streaming output') if self._closed is True: log.warn( 'WARNING! Trying to stream output from a closed runner, output' ' will probably be empty.') log.info('Streaming final output from %s' % output_dir) def split_path(path): while True: base, name = os.path.split(path) # no more elements if not name: break yield name path = base for filename in self.ls(output_dir): subpath = filename[len(output_dir):] if not any(name.startswith('_') for name in split_path(subpath)): for line in self._cat_file(filename): yield line def _cleanup_mode(self, mode=None): """Actual cleanup action to take based on various options""" if self._script_path and not self._ran_job: return mode or self._opts['cleanup_on_failure'] else: return mode or self._opts['cleanup'] def _cleanup_local_scratch(self): """Cleanup any files/directories on the local machine we created while running this job. Should be safe to run this at any time, or multiple times. This particular function removes any local tmp directories added to the list self._local_tmp_dirs This won't remove output_dir if it's outside of our scratch dir. """ if self._local_tmp_dir: log.info('removing tmp directory %s' % self._local_tmp_dir) try: shutil.rmtree(self._local_tmp_dir) except OSError as e: log.exception(e) self._local_tmp_dir = None def _cleanup_remote_scratch(self): """Cleanup any files/directories on the remote machine (S3) we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # this only happens on EMR def _cleanup_logs(self): """Cleanup any log files that are created as a side-effect of the job. """ pass # this only happens on EMR def _cleanup_job(self): """Stop any jobs that we created that are still running.""" pass # this only happens on EMR def _cleanup_job_flow(self): """Terminate the job flow if there is one.""" pass # this only happens on EMR def cleanup(self, mode=None): """Clean up running jobs, scratch dirs, and logs, subject to the *cleanup* option passed to the constructor. If you create your runner in a :keyword:`with` block, :py:meth:`cleanup` will be called automatically:: with mr_job.make_runner() as runner: ... # cleanup() called automatically here :param mode: override *cleanup* passed into the constructor. 
Should be a list of strings from :py:data:`CLEANUP_CHOICES` """ mode = self._cleanup_mode(mode) def mode_has(*args): return any((choice in mode) for choice in args) if self._script_path and not self._ran_job: if mode_has('JOB_FLOW', 'ALL'): self._cleanup_job_flow() if mode_has('JOB', 'ALL'): self._cleanup_job() if mode_has('ALL', 'SCRATCH', 'LOCAL_SCRATCH'): self._cleanup_local_scratch() if mode_has('ALL', 'SCRATCH', 'REMOTE_SCRATCH'): self._cleanup_remote_scratch() if mode_has('ALL', 'LOGS'): self._cleanup_logs() self._closed = True def counters(self): """Get counters associated with this run in this form:: [{'group name': {'counter1': 1, 'counter2': 2}}, {'group name': ...}] The list contains an entry for every step of the current job, ignoring earlier steps in the same job flow. """ raise NotImplementedError def print_counters(self, limit_to_steps=None): """Display this run's counters in a user-friendly way. :type first_step_num: int :param first_step_num: Display step number of the counters from the first step :type limit_to_steps: list of int :param limit_to_steps: List of step numbers *relative to this job* to print, indexed from 1 """ for step_num, step_counters in enumerate(self.counters()): step_num = step_num + 1 if limit_to_steps is None or step_num in limit_to_steps: log.info('Counters from step %d:' % step_num) if step_counters.keys(): for group_name in sorted(step_counters.keys()): log.info(' %s:' % group_name) group_counters = step_counters[group_name] for counter_name in sorted(group_counters.keys()): log.info( ' %s: %d' % (counter_name, group_counters[counter_name])) else: log.info(' (no counters found)') ### hooks for the with statement ### def __enter__(self): """Don't do anything special at start of with block""" return self def __exit__(self, type, value, traceback): """Call self.cleanup() at end of with block.""" self.cleanup() ### more runner information ### def get_opts(self): """Get options set for this runner, as a dict.""" return copy.deepcopy(self._opts) def get_job_name(self): """Get the unique name for the job run by this runner. This has the format ``label.owner.date.time.microseconds`` """ return self._job_name def get_output_dir(self): """Find the directory containing the job output. If the job hasn't run yet, returns None""" if self._script_path and not self._ran_job: return None return self._output_dir ### other methods you need to implement in your subclass ### def get_hadoop_version(self): """Return the version number of the Hadoop environment as a string if Hadoop is being used or simulated. Return None if not applicable. :py:class:`~mrjob.emr.EMRJobRunner` infers this from the job flow. :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an additional `hadoop_version` option to specify which version it simulates, with a default of 0.20. :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at all. 
""" return None # you'll probably wan't to add your own __init__() and cleanup() as well def _run(self): """Run the job.""" raise NotImplementedError ### internal utilities for implementing MRJobRunners ### def _get_local_tmp_dir(self): """Create a tmp directory on the local filesystem that will be cleaned up by self.cleanup()""" if not self._local_tmp_dir: path = os.path.join(self._opts['base_tmp_dir'], self._job_name) log.info('creating tmp directory %s' % path) if os.path.isdir(path): shutil.rmtree(path) os.makedirs(path) self._local_tmp_dir = path return self._local_tmp_dir def _make_unique_job_name(self, label=None, owner=None): """Come up with a useful unique ID for this job. We use this to choose the output directory, etc. for the job. """ # use the name of the script if one wasn't explicitly # specified if not label: if self._script_path: label = os.path.basename(self._script_path).split('.')[0] else: label = 'no_script' if not owner: owner = 'no_user' now = datetime.datetime.utcnow() return '%s.%s.%s.%06d' % (label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond) def _get_steps(self): """Call the job script to find out how many steps it has, and whether there are mappers and reducers for each step. Validate its output. Returns output as described in :ref:`steps-format`. Results are cached, so call this as many times as you want. """ if self._steps is None: if not self._script_path: self._steps = [] else: args = (self._executable(True) + ['--steps'] + self._mr_job_extra_args(local=True)) log.debug('> %s' % cmd_line(args)) # add . to PYTHONPATH (in case mrjob isn't actually installed) env = combine_local_envs(os.environ, {'PYTHONPATH': os.path.abspath('.')}) steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env) stdout, stderr = steps_proc.communicate() if steps_proc.returncode != 0: raise Exception('error getting step information: \n%s' % stderr) try: steps = json.loads(stdout) except JSONDecodeError: raise ValueError("Bad --steps response: \n%s" % stdout) # verify that this is a proper step description if not steps or not stdout: raise ValueError('step description is empty!') for step in steps: if step['type'] not in STEP_TYPES: raise ValueError( 'unexpected step type %r in steps %r' % (step['type'], stdout)) self._steps = steps return self._steps def _get_step(self, step_num): """Get a single step (calls :py:meth:`_get_steps`).""" return self._get_steps()[step_num] def _num_steps(self): """Get the number of steps (calls :py:meth:`get_steps`).""" return len(self._get_steps()) def _executable(self, steps=False): # default behavior is to always use an interpreter. local, emr, and # hadoop runners check for executable script paths and prepend the # working_dir, discarding the interpreter if possible. 
if steps: return self._opts['steps_interpreter'] + [self._script_path] else: return (self._opts['interpreter'] + [self._working_dir_mgr.name('file', self._script_path)]) def _script_args_for_step(self, step_num, mrc): assert self._script_path args = self._executable() + [ '--step-num=%d' % step_num, '--%s' % mrc, ] + self._mr_job_extra_args() if self._setup_wrapper_script_path: return (self._opts['sh_bin'] + [ self._working_dir_mgr.name('file', self._setup_wrapper_script_path) ] + args) else: return args def _substep_cmd_line(self, step_num, mrc): step = self._get_step(step_num) if step[mrc]['type'] == 'command': # never wrap custom hadoop streaming commands in bash return step[mrc]['command'], False elif step[mrc]['type'] == 'script': cmd = cmd_line(self._script_args_for_step(step_num, mrc)) # filter input and pipe for great speed, if user asks # but we have to wrap the command in bash if 'pre_filter' in step[mrc]: return '%s | %s' % (step[mrc]['pre_filter'], cmd), True else: return cmd, False else: raise ValueError("Invalid %s step %d: %r" % (mrc, step_num, step[mrc])) def _render_substep(self, step_num, mrc): step = self._get_step(step_num) if mrc in step: return self._substep_cmd_line(step_num, mrc) else: if mrc == 'mapper': return 'cat', False else: return None, False def _hadoop_streaming_commands(self, step_num): version = self.get_hadoop_version() # Hadoop streaming stuff mapper, bash_wrap_mapper = self._render_substep(step_num, 'mapper') combiner, bash_wrap_combiner = self._render_substep( step_num, 'combiner') reducer, bash_wrap_reducer = self._render_substep(step_num, 'reducer') if (combiner is not None and not supports_combiners_in_hadoop_streaming(version)): # krazy hack to support combiners on hadoop <0.20 bash_wrap_mapper = True mapper = "%s | sort | %s" % (mapper, combiner) # take the combiner away, hadoop will just be confused combiner = None bash_wrap_combiner = False if bash_wrap_mapper: mapper = bash_wrap(mapper) if bash_wrap_combiner: combiner = bash_wrap(combiner) if bash_wrap_reducer: reducer = bash_wrap(reducer) return mapper, combiner, reducer def _mr_job_extra_args(self, local=False): """Return arguments to add to every invocation of MRJob. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ return (self._get_file_upload_args(local=local) + self._get_strict_protocols_args() + self._extra_args) def _get_file_upload_args(self, local=False): """Arguments used to pass through config files, etc from the job runner through to the local directory where the script is run. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ args = [] for arg, path_dict in self._file_upload_args: args.append(arg) if local: args.append(path_dict['path']) else: args.append(self._working_dir_mgr.name(**path_dict)) return args def _get_strict_protocols_args(self): """Arguments used to control protocol behavior in the job. """ # These are only in the runner so that we can default them from # mrjob.conf, which will allow us to eventually remove them. # See issue #726. if self._opts['strict_protocols']: return ['--strict-protocols'] elif self._opts['strict_protocols'] is None: return [] else: return ['--no-strict-protocols'] def _create_setup_wrapper_script(self, dest='setup-wrapper.sh'): """Create the wrapper script, and write it into our local temp directory (by default, to a file named wrapper.sh). 
This will set ``self._setup_wrapper_script_path``, and add it to ``self._working_dir_mgr`` This will do nothing if ``self._setup`` is empty or this method has already been called. """ if self._setup_wrapper_script_path: return setup = self._setup if self._opts['bootstrap_mrjob'] and self.BOOTSTRAP_MRJOB_IN_SETUP: # patch setup to add mrjob.tar.gz to PYTYHONPATH mrjob_tar_gz = self._create_mrjob_tar_gz() path_dict = {'type': 'archive', 'name': None, 'path': mrjob_tar_gz} self._working_dir_mgr.add(**path_dict) setup = [['export PYTHONPATH=', path_dict, ':$PYTHONPATH']] + setup if not setup: return path = os.path.join(self._get_local_tmp_dir(), dest) log.info('writing wrapper script to %s' % path) contents = self._setup_wrapper_script_content(setup) for line in StringIO(contents): log.debug('WRAPPER: ' + line.rstrip('\r\n')) with open(path, 'w') as f: f.write(contents) self._setup_wrapper_script_path = path self._working_dir_mgr.add('file', self._setup_wrapper_script_path) def _parse_setup(self): """Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`. If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both true, create mrjob.tar.gz (if it doesn't exist already) and prepend a setup command that adds it to PYTHONPATH. Also patch in the deprecated options *python_archives*, *setup_cmd*, and *setup_script* as setup commands. """ setup = [] # python_archives for path in self._opts['python_archives']: path_dict = parse_legacy_hash_path('archive', path) setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH']) # setup for cmd in self._opts['setup']: setup.append(parse_setup_cmd(cmd)) # setup_cmds if self._opts['setup_cmds']: log.warning( "setup_cmds is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead.") for cmd in self._opts['setup_cmds']: if not isinstance(cmd, basestring): cmd = cmd_line(cmd) setup.append([cmd]) # setup_scripts if self._opts['setup_scripts']: log.warning( "setup_scripts is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead.") for path in self._opts['setup_scripts']: path_dict = parse_legacy_hash_path('file', path) setup.append([path_dict]) return setup def _setup_wrapper_script_content(self, setup, mrjob_tar_gz_name=None): """Return a (Bourne) shell script that runs the setup commands and then executes whatever is passed to it (this will be our mapper/reducer). We obtain a file lock so that two copies of the setup commands cannot run simultaneously on the same machine (this helps for running :command:`make` on a shared source code archive, for example). """ out = StringIO() def writeln(line=''): out.write(line + '\n') # we're always going to execute this script as an argument to # sh, so there's no need to add a shebang (e.g. #!/bin/sh) writeln('# store $PWD') writeln('__mrjob_PWD=$PWD') writeln() writeln('# obtain exclusive file lock') # Basically, we're going to tie file descriptor 9 to our lockfile, # use a subprocess to obtain a lock (which we somehow inherit too), # and then release the lock by closing the file descriptor. # File descriptors 10 and higher are used internally by the shell, # so 9 is as out-of-the-way as we can get. writeln('exec 9>/tmp/wrapper.lock.%s' % self._job_name) # would use flock(1), but it's not always available writeln("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" % cmd_line(self._opts['python_bin'])) writeln() writeln('# setup commands') # group setup commands so we can redirect their input/output (see # below). 
Don't use parens; this would invoke a subshell, which would # keep us from exporting environment variables to the task. writeln('{') for cmd in setup: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = ' ' # indent, since these commands are in a group for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._working_dir_mgr.name(**token)) else: # it's raw script line += token writeln(line) # redirect setup commands' input/output so they don't interfere # with the task (see Issue #803). writeln('} 0</dev/null 1>&2') writeln() writeln('# release exclusive file lock') writeln('exec 9>&-') writeln() writeln('# run task from the original working directory') writeln('cd $__mrjob_PWD') writeln('"$@"') return out.getvalue() def _get_input_paths(self): """Get the paths to input files, dumping STDIN to a local file if need be.""" if '-' in self._input_paths: if self._stdin_path is None: # prompt user, so they don't think the process has stalled log.info('reading from STDIN') stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') log.debug('dumping stdin to local file %s' % stdin_path) with open(stdin_path, 'w') as stdin_file: for line in self._stdin: # catch missing newlines (often happens with test data) if not line.endswith('\n'): line += '\n' stdin_file.write(line) self._stdin_path = stdin_path return [self._stdin_path if p == '-' else p for p in self._input_paths] def _create_mrjob_tar_gz(self): """Make a tarball of the mrjob library, without .pyc or .pyo files, This will also set ``self._mrjob_tar_gz_path`` and return it. Typically called from :py:meth:`_create_setup_wrapper_script`. It's safe to call this method multiple times (we'll only create the tarball once.) """ if not self._mrjob_tar_gz_path: # find mrjob library import mrjob if not os.path.basename(mrjob.__file__).startswith('__init__.'): raise Exception( "Bad path for mrjob library: %s; can't bootstrap mrjob", mrjob.__file__) mrjob_dir = os.path.dirname(mrjob.__file__) or '.' tar_gz_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.tar.gz') def filter_path(path): filename = os.path.basename(path) return not (file_ext(filename).lower() in ('.pyc', '.pyo') or # filter out emacs backup files filename.endswith('~') or # filter out emacs lock files filename.startswith('.#') or # filter out MacFuse resource forks filename.startswith('._')) log.debug('archiving %s -> %s as %s' % (mrjob_dir, tar_gz_path, os.path.join('mrjob', ''))) tar_and_gzip(mrjob_dir, tar_gz_path, filter=filter_path, prefix='mrjob') self._mrjob_tar_gz_path = tar_gz_path return self._mrjob_tar_gz_path def _jobconf_for_step(self, step_num): """Get the jobconf dictionary, optionally including step-specific jobconf info. Also translate jobconfs to the current Hadoop version, if necessary. """ step = self._get_step(step_num) jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf')) return add_translated_jobconf_for_hadoop_version( jobconf, self.get_hadoop_version()) def _hadoop_args_for_step(self, step_num): """Build a list of extra arguments to the hadoop binary. This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*, *hadoop_output_format*, *jobconf*, and *partitioner*. This doesn't handle input, output, mappers, reducers, or uploading files. 
""" assert 0 <= step_num < self._num_steps() args = [] # hadoop_extra_args args.extend(self._opts['hadoop_extra_args']) # new-style jobconf version = self.get_hadoop_version() # translate the jobconf configuration names to match # the hadoop version jobconf = self._jobconf_for_step(step_num) if uses_generic_jobconf(version): for key, value in sorted(jobconf.iteritems()): if value is not None: args.extend(['-D', '%s=%s' % (key, value)]) # old-style jobconf else: for key, value in sorted(jobconf.iteritems()): if value is not None: args.extend(['-jobconf', '%s=%s' % (key, value)]) # partitioner if self._partitioner: args.extend(['-partitioner', self._partitioner]) # cmdenv for key, value in sorted(self._opts['cmdenv'].iteritems()): args.append('-cmdenv') args.append('%s=%s' % (key, value)) # hadoop_input_format if (step_num == 0 and self._hadoop_input_format): args.extend(['-inputformat', self._hadoop_input_format]) # hadoop_output_format if (step_num == self._num_steps() - 1 and self._hadoop_output_format): args.extend(['-outputformat', self._hadoop_output_format]) return args def _arg_hash_paths(self, type, upload_mgr): """Helper function for the *upload_args methods.""" for name, path in self._working_dir_mgr.name_to_path(type).iteritems(): uri = self._upload_mgr.uri(path) yield '%s#%s' % (uri, name) def _new_upload_args(self, upload_mgr): args = [] # TODO: does Hadoop have a way of coping with paths that have # commas in their names? file_hash_paths = list(self._arg_hash_paths('file', upload_mgr)) if file_hash_paths: args.append('-files') args.append(','.join(file_hash_paths)) archive_hash_paths = list(self._arg_hash_paths('archive', upload_mgr)) if archive_hash_paths: args.append('-archives') args.append(','.join(archive_hash_paths)) return args def _old_upload_args(self, upload_mgr): args = [] for file_hash in self._arg_hash_paths('file', upload_mgr): args.append('-cacheFile') args.append(file_hash) for archive_hash in self._arg_hash_paths('archive', upload_mgr): args.append('-cacheArchive') args.append(archive_hash) return args def _invoke_sort(self, input_paths, output_path): """Use the local sort command to sort one or more input files. Raise an exception if there is a problem. This is is just a wrapper to handle limitations of Windows sort (see Issue #288). :type input_paths: list of str :param input_paths: paths of one or more input files :type output_path: str :param output_path: where to pipe sorted output into """ if not input_paths: raise ValueError('Must specify at least one input path.') # ignore locale when sorting env = os.environ.copy() env['LC_ALL'] = 'C' # Make sure that the base tmp dir environment variables are changed if # the default is changed. 
env['TMP'] = self._opts['base_tmp_dir'] env['TMPDIR'] = self._opts['base_tmp_dir'] env['TEMP'] = self._opts['base_tmp_dir'] log.info('writing to %s' % output_path) err_path = os.path.join(self._get_local_tmp_dir(), 'sort-stderr') # assume we're using UNIX sort unless we know otherwise if (not self._sort_is_windows_sort) or len(input_paths) == 1: with open(output_path, 'w') as output: with open(err_path, 'w') as err: args = ['sort'] + list(input_paths) log.info('> %s' % cmd_line(args)) try: check_call(args, stdout=output, stderr=err, env=env) return except CalledProcessError: pass # Looks like we're using Windows sort self._sort_is_windows_sort = True log.info('Piping files into sort for Windows compatibility') with open(output_path, 'w') as output: with open(err_path, 'w') as err: args = ['sort'] log.info('> %s' % cmd_line(args)) proc = Popen(args, stdin=PIPE, stdout=output, stderr=err, env=env) # shovel bytes into the sort process for input_path in input_paths: with open(input_path, 'r') as input: while True: buf = input.read(_BUFFER_SIZE) if not buf: break proc.stdin.write(buf) proc.stdin.close() proc.wait() if proc.returncode == 0: return # looks like there was a problem. log it and raise an error with open(err_path) as err: for line in err: log.error('STDERR: %s' % line.rstrip('\r\n')) raise CalledProcessError(proc.returncode, args)
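# Illustrative sketch (not part of mrjob): the two-tier sort strategy that
# _invoke_sort() above implements. First try handing the input files
# directly to `sort` with LC_ALL=C (byte-wise ordering, matching Hadoop);
# if that fails (e.g. Windows sort won't take multiple input files this
# way), fall back to piping the bytes into sort's stdin. Error output
# handling is simplified here, and the function name is made up.
import os
from subprocess import CalledProcessError, PIPE, Popen, check_call


def sort_files(input_paths, output_path):
    env = os.environ.copy()
    env['LC_ALL'] = 'C'  # sort by byte value, ignoring locale

    # fast path: UNIX sort accepts multiple input files as arguments
    with open(output_path, 'wb') as output:
        try:
            check_call(['sort'] + list(input_paths), stdout=output, env=env)
            return
        except CalledProcessError:
            pass  # probably Windows sort; fall through to piping

    # slow path: shovel the bytes into sort's stdin ourselves
    with open(output_path, 'wb') as output:
        proc = Popen(['sort'], stdin=PIPE, stdout=output, env=env)
        for path in input_paths:
            with open(path, 'rb') as f:
                proc.stdin.write(f.read())
        proc.stdin.close()
        if proc.wait() != 0:
            raise CalledProcessError(proc.returncode, ['sort'])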