def _env_for_task(self, task_type, step_num, task_num, map_split=None):
    """Set up environment variables for a subprocess (mapper, etc.)

    This combines, in decreasing order of priority:

    * environment variables set by the **cmdenv** option
    * **jobconf** environment variables set by our job (e.g.
      ``mapreduce.task.ismap``)
    * environment variables from **jobconf** options, translated to
      whatever version of Hadoop we're emulating
    * the current environment
    * PYTHONPATH set to current working directory

    We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
    environment variables are handled specially.
    """
    user_jobconf = self._jobconf_for_step(step_num)

    simulated_jobconf = self._simulate_jobconf_for_step(
        task_type, step_num, task_num, map_split)

    def to_env(jobconf):
        return dict((k.replace('.', '_'), str(v))
                    for k, v in jobconf.items())

    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    return combine_local_envs(
        os.environ,
        to_env(user_jobconf),
        to_env(simulated_jobconf),
        self._opts['cmdenv'])

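# A minimal sketch of the combine_local_envs() semantics the docstring above
# relies on, assuming mrjob's documented behavior: later dicts win on
# conflicting keys, except that keys ending in 'PATH' are concatenated with
# os.pathsep, higher-priority values first. The sample values are invented.
import os

from mrjob.conf import combine_local_envs

env = combine_local_envs(
    {'PATH': '/usr/bin', 'TMPDIR': '/tmp'},            # lowest priority
    {'PATH': '/home/dave/bin', 'TMPDIR': '/scratch'},  # highest priority
)

# ordinary keys are simply overwritten by the later dict
assert env['TMPDIR'] == '/scratch'

# *PATH keys are merged instead, higher-priority entries first
assert env['PATH'] == '/home/dave/bin' + os.pathsep + '/usr/bin'
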
def _load_steps(self):
    args = (self._executable(True) + ['--steps'] +
            self._mr_job_extra_args(local=True))
    log.debug('> %s' % cmd_line(args))

    # add . to PYTHONPATH (in case mrjob isn't actually installed)
    env = combine_local_envs(os.environ,
                             {'PYTHONPATH': os.path.abspath('.')})

    steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
    stdout, stderr = steps_proc.communicate()

    if steps_proc.returncode != 0:
        raise Exception(
            'error getting step information: \n%s' % stderr)

    # on Python 3, convert stdout to str so we can json.loads() it
    if not isinstance(stdout, str):
        stdout = stdout.decode('utf_8')

    try:
        steps = json.loads(stdout)
    except ValueError:
        raise ValueError("Bad --steps response: \n%s" % stdout)

    # verify that this is a proper step description
    if not steps or not stdout:
        raise ValueError('step description is empty!')

    return steps

def _subprocess_env(self, step_num, step_type, task_num, working_dir,
                    **split_kwargs):
    """Set up environment variables for a subprocess (mapper, etc.)

    This combines, in decreasing order of priority:

    * environment variables set by the **cmdenv** option
    * **jobconf** environment variables set by our job (e.g.
      ``mapreduce.task.ismap``)
    * environment variables from **jobconf** options, translated to
      whatever version of Hadoop we're emulating
    * the current environment
    * PYTHONPATH set to current working directory

    We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
    environment variables are handled specially.
    """
    user_jobconf = self._jobconf_for_step(step_num)

    simulated_jobconf = self._simulate_jobconf_for_step(
        step_num, step_type, task_num, working_dir, **split_kwargs)

    def to_env(jobconf):
        return dict(
            (k.replace('.', '_'), str(v))
            for k, v in jobconf.items())

    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    return combine_local_envs(
        os.environ,
        to_env(user_jobconf),
        to_env(simulated_jobconf),
        self._opts['cmdenv'])

def test_paths(self):
    self.assertEqual(
        combine_local_envs(
            {'PATH': '/bin:/usr/bin',
             'PYTHONPATH': '/usr/lib/python/site-packages',
             'PS1': '> '},
            {'PATH': '/home/dave/bin',
             'PYTHONPATH': '/home/dave/python',
             'CLASSPATH': '/home/dave/java',
             'PS1': r'\w> '}),
        {'PATH': '/home/dave/bin;/bin:/usr/bin',
         'PYTHONPATH': '/home/dave/python;/usr/lib/python/site-packages',
         'CLASSPATH': '/home/dave/java',
         'PS1': r'\w> '})

def test_paths(self):
    assert_equal(
        combine_local_envs(
            {'PATH': '/bin:/usr/bin',
             'PYTHONPATH': '/usr/lib/python/site-packages',
             'PS1': '> '},
            {'PATH': '/home/dave/bin',
             'PYTHONPATH': '/home/dave/python',
             'CLASSPATH': '/home/dave/java',
             'PS1': r'\w> '}),
        {'PATH': '/home/dave/bin;/bin:/usr/bin',
         'PYTHONPATH': '/home/dave/python;/usr/lib/python/site-packages',
         'CLASSPATH': '/home/dave/java',
         'PS1': r'\w> '})

def _get_steps(self):
    """Call the job script to find out how many steps it has, and whether
    there are mappers and reducers for each step. Validate its output.

    Returns output as described in :ref:`steps-format`.

    Results are cached, so call this as many times as you want.
    """
    if self._steps is None:
        if not self._script_path:
            self._steps = []
        else:
            args = (self._executable(True) + ['--steps'] +
                    self._mr_job_extra_args(local=True))
            log.debug('> %s' % cmd_line(args))

            # add . to PYTHONPATH (in case mrjob isn't actually installed)
            env = combine_local_envs(os.environ,
                                     {'PYTHONPATH': os.path.abspath('.')})
            steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
            stdout, stderr = steps_proc.communicate()

            if steps_proc.returncode != 0:
                raise Exception(
                    'error getting step information: \n%s' % stderr)

            # on Python 3, convert stdout to str so we can json.loads() it
            if not isinstance(stdout, str):
                stdout = stdout.decode('utf_8')

            try:
                steps = json.loads(stdout)
            except ValueError:
                raise ValueError("Bad --steps response: \n%s" % stdout)

            # verify that this is a proper step description
            if not steps or not stdout:
                raise ValueError('step description is empty!')
            for step in steps:
                if step['type'] not in STEP_TYPES:
                    raise ValueError(
                        'unexpected step type %r in steps %r' % (
                            step['type'], stdout))

            self._steps = steps

    return self._steps

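# A rough sketch of what the --steps subprocess above is expected to print:
# a JSON list with one dict per step, each carrying a 'type' key that is
# checked against STEP_TYPES. The payload below is an assumption based on
# the :ref:`steps-format` docs; keys other than 'type' are illustrative.
import json

stdout = b'[{"type": "streaming", "mapper": {"type": "script"}}]'

steps = json.loads(stdout.decode('utf_8'))
assert steps[0]['type'] == 'streaming'
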
def _get_steps(self):
    """Call the mr_job to find out how many steps it has, and whether
    there are mappers and reducers for each step. Validate its output.

    Returns output like ['MR', 'M'] (two steps, second only has a mapper)

    We'll cache the result (so you can call _get_steps() as many times
    as you want)
    """
    if self._steps is None:
        if not self._script:
            self._steps = []
        else:
            # don't use self._opts['python_bin'] because that
            # refers to the python binary to use inside Hadoop
            python_bin = sys.executable or 'python'
            args = ([python_bin, self._script['path'], '--steps'] +
                    self._mr_job_extra_args(local=True))
            log.debug('> %s' % cmd_line(args))

            # add . to PYTHONPATH (in case mrjob isn't actually installed)
            env = combine_local_envs(os.environ,
                                     {'PYTHONPATH': os.path.abspath('.')})
            steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
            stdout, stderr = steps_proc.communicate()

            if steps_proc.returncode != 0:
                raise Exception(
                    'error getting step information: %s' % stderr)

            steps = stdout.strip().split(' ')

            # verify that this is a proper step description
            if not steps:
                raise ValueError('step description is empty!')
            for step in steps:
                if step not in ('MR', 'M'):
                    raise ValueError(
                        'unexpected step type %r in steps %r' %
                        (step, stdout))

            self._steps = steps

    return self._steps

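# The older protocol above is plain text rather than JSON: the job prints
# one token per step, 'MR' for mapper-plus-reducer or 'M' for mapper-only.
# A quick round trip of the parsing logic:
stdout = 'MR M\n'
steps = stdout.strip().split(' ')
assert steps == ['MR', 'M']  # two steps; the second has no reducer
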
def _subprocess_env(self, step_type, step_num, task_num, input_file=None,
                    input_start=None, input_length=None):
    """Set up environment variables for a subprocess (mapper, etc.)

    This combines, in decreasing order of priority:

    * environment variables set by the **cmdenv** option
    * **jobconf** environment variables set by our job (e.g.
      ``mapreduce.task.ismap``)
    * environment variables from **jobconf** options, translated to
      whatever version of Hadoop we're emulating
    * the current environment
    * PYTHONPATH set to current working directory

    We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
    environment variables are handled specially.
    """
    version = self.get_hadoop_version()

    jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in self._opts['jobconf'].iteritems())

    internal_jobconf = self._simulate_jobconf_for_step(
        step_type, step_num, task_num,
        input_file=input_file, input_start=input_start,
        input_length=input_length)

    internal_jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in internal_jobconf.iteritems())

    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    return combine_local_envs(os.environ,
                              jobconf_env,
                              internal_jobconf_env,
                              self._get_cmdenv())

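# A small sketch of the translate_jobconf() helper used above (from
# mrjob.compat): it maps a jobconf property name to its equivalent in the
# Hadoop version being emulated. The example property is standard Hadoop;
# treat the exact return value as an assumption.
from mrjob.compat import translate_jobconf

# Hadoop 0.21+ renamed most mapred.* properties to mapreduce.*, so we'd
# expect something like:
translate_jobconf('mapreduce.job.id', '0.20')  # -> 'mapred.job.id'

# the runner then swaps dots for underscores to get a legal env var name:
'mapreduce.job.id'.replace('.', '_')  # -> 'mapreduce_job_id'
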
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env):
    step = self._get_step(step_num)

    if step_type == 'mapper':
        procs_args = self._mapper_arg_chain(step, step_num, input_path)
    elif step_type == 'reducer':
        procs_args = self._reducer_arg_chain(step, step_num, input_path)

    # add . to PYTHONPATH (in case mrjob isn't actually installed)
    # we need this to access mrjob.cat
    # if we wanted, we could move read_file() and read_input()
    # to mrjob.cat and make it a standalone script
    env = combine_local_envs(env, {'PYTHONPATH': abspath('.')})

    proc_dicts = self._invoke_processes(
        procs_args, output_path, working_dir, env)

    self._all_proc_dicts.extend(proc_dicts)

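# Why '.' goes on PYTHONPATH here: the subprocess launched with this env can
# import mrjob.cat from the current working directory even when mrjob isn't
# installed site-wide. A hedged standalone check (it assumes you run it from
# an mrjob source checkout):
import os
import subprocess
import sys

env = dict(os.environ, PYTHONPATH=os.path.abspath('.'))
subprocess.check_call([sys.executable, '-c', 'import mrjob.cat'], env=env)
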
def _get_steps(self):
    """Call the job script to find out how many steps it has, and whether
    there are mappers and reducers for each step. Validate its output.

    Returns output as described in :ref:`steps-format`.

    Results are cached to avoid round trips to a subprocess.
    """
    if self._steps is None:
        if not self._script:
            self._steps = []
        else:
            args = (self._opts['steps_python_bin'] +
                    [self._script['path'], '--steps'] +
                    self._mr_job_extra_args(local=True))
            log.debug('> %s' % cmd_line(args))

            # add . to PYTHONPATH (in case mrjob isn't actually installed)
            env = combine_local_envs(os.environ,
                                     {'PYTHONPATH': os.path.abspath('.')})
            steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
            stdout, stderr = steps_proc.communicate()

            if steps_proc.returncode != 0:
                raise Exception(
                    'error getting step information: %s' % stderr)

            try:
                steps = json.loads(stdout)
            except ValueError:  # includes json.JSONDecodeError
                raise ValueError("Bad --steps response: \n%s" % stdout)

            # verify that this is a proper step description
            if not steps or not stdout:
                raise ValueError('step description is empty!')
            for step in steps:
                if step['type'] not in STEP_TYPES:
                    raise ValueError(
                        'unexpected step type %r in steps %r' %
                        (step['type'], stdout))

            self._steps = steps

    return self._steps

def _subprocess_env(self, step_type, step_num, task_num, input_file=None,
                    input_start=None, input_length=None):
    """Set up environment variables for a subprocess (mapper, etc.)

    This combines, in decreasing order of priority:

    * environment variables set by the **cmdenv** option
    * **jobconf** environment variables set by our job (e.g.
      ``mapreduce.task.ismap``)
    * environment variables from **jobconf** options, translated to
      whatever version of Hadoop we're emulating
    * the current environment
    * PYTHONPATH set to current working directory

    We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
    environment variables are handled specially.
    """
    version = self.get_hadoop_version()

    jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in self._opts['jobconf'].iteritems())

    internal_jobconf = self._simulate_jobconf_for_step(
        step_type, step_num, task_num,
        input_file=input_file, input_start=input_start,
        input_length=input_length)

    internal_jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in internal_jobconf.iteritems())

    ironpython_env = (
        {'IRONPYTHONPATH': os.getcwd()} if is_ironpython else {})

    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    return combine_local_envs({'PYTHONPATH': os.getcwd()},
                              ironpython_env,
                              os.environ,
                              jobconf_env,
                              internal_jobconf_env,
                              self._get_cmdenv())

def _get_steps(self):
    """Call the job script to find out how many steps it has, and whether
    there are mappers and reducers for each step. Validate its output.

    Returns output as described in :ref:`steps-format`.

    Results are cached, so call this as many times as you want.
    """
    if self._steps is None:
        if not self._script_path:
            self._steps = []
        else:
            args = (
                self._executable(True)
                + ["--steps"]
                + self._mr_job_extra_args(local=True)
            )
            log.debug("> %s" % cmd_line(args))

            # add . to PYTHONPATH (in case mrjob isn't actually installed)
            env = combine_local_envs(
                os.environ, {"PYTHONPATH": os.path.abspath(".")}
            )
            steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
            stdout, stderr = steps_proc.communicate()

            if steps_proc.returncode != 0:
                raise Exception(
                    "error getting step information: \n%s" % stderr
                )

            try:
                steps = json.loads(stdout)
            except JSONDecodeError:
                raise ValueError("Bad --steps response: \n%s" % stdout)

            # verify that this is a proper step description
            if not steps or not stdout:
                raise ValueError("step description is empty!")
            for step in steps:
                if step["type"] not in STEP_TYPES:
                    raise ValueError(
                        "unexpected step type %r in steps %r"
                        % (step["type"], stdout)
                    )

            self._steps = steps

    return self._steps

def _invoke_step(self, args, outfile_name, env=None):
    """Run the given command, outputting into outfile, and reading
    from the previous outfile (or, for the first step, from our
    original input files).

    outfile is a path relative to our local tmp dir. commands are run
    inside self._working_dir

    We'll intelligently handle stderr from the process.
    """
    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    env = combine_local_envs(
        {'PYTHONPATH': os.getcwd()},
        os.environ,
        self._get_cmdenv(),
        env or {})

    # decide where to get input
    if self._prev_outfile is not None:
        input_paths = [self._prev_outfile]
    else:
        input_paths = []
        for path in self._input_paths:
            if path == '-':
                input_paths.append(self._dump_stdin_to_local_file())
            else:
                input_paths.append(path)

    # add input to the command line
    for path in input_paths:
        args.append(os.path.abspath(path))

    log.info('> %s' % cmd_line(args))

    # set up outfile
    outfile = os.path.join(self._get_local_tmp_dir(), outfile_name)
    log.info('writing to %s' % outfile)
    log.debug('')

    self._prev_outfile = outfile
    write_to = open(outfile, 'w')

    # run the process
    proc = Popen(args, stdout=write_to, stderr=PIPE,
                 cwd=self._working_dir, env=env)

    # handle counters, status msgs, and other stuff on stderr
    stderr_lines = self._process_stderr_from_script(proc.stderr)
    tb_lines = find_python_traceback(stderr_lines)

    self._print_counters()

    returncode = proc.wait()
    if returncode != 0:
        # try to throw a useful exception
        if tb_lines:
            raise Exception(
                'Command %r returned non-zero exit status %d:\n%s' %
                (args, returncode, ''.join(tb_lines)))
        else:
            raise Exception(
                'Command %r returned non-zero exit status %d' %
                (args, returncode))

    # flush file descriptors
    write_to.flush()

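# A hedged sketch of the stderr handling above: find_python_traceback()
# (from mrjob.parse) scans stderr lines and returns the lines of the first
# Python traceback it finds, or None if there isn't one. The sample lines
# are invented for illustration.
from mrjob.parse import find_python_traceback

stderr_lines = [
    'reporter:counter:wc,lines,1\n',
    'Traceback (most recent call last):\n',
    '  File "mr_word_count.py", line 10, in <module>\n',
    'ValueError: bad input\n',
]

tb_lines = find_python_traceback(stderr_lines)
if tb_lines:
    # this is the payload joined into the exception message built in
    # _invoke_step() above
    print(''.join(tb_lines))
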