def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    # if no mapper, just pass the data through (see #1141)
    if step_type == 'mapper' and not step.get('mapper'):
        copyfile(input_path, output_path)
        return

    # Passing local=False ensures the job uses proper names for file
    # options (see issue #851 on github)
    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=False))

    if step_type == 'mapper':
        child_args = (
            ['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (
            ['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    try:
        # Use custom stdout
        if has_combiner:
            child_stdout = BytesIO()
        else:
            child_stdout = open(output_path, 'wb')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance = self._mrjob_cls(args=child_args)
                child_instance.sandbox(stdin=child_stdin,
                                       stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
        else:
            child_stdout.flush()
    finally:
        child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)
        combiner_stdin.close()
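# All of the _run_step() variants lean on save_current_environment() and
# save_cwd(), which are defined elsewhere in mrjob. A minimal sketch of what
# such helpers might look like, assuming contextlib-based context managers
# that snapshot state and restore it on exit (an illustration, not mrjob's
# actual implementation):

import os
from contextlib import contextmanager


@contextmanager
def save_current_environment():
    """Snapshot os.environ and restore it on exit."""
    old_environ = os.environ.copy()
    try:
        yield
    finally:
        os.environ.clear()
        os.environ.update(old_environ)


@contextmanager
def save_cwd():
    """Snapshot the current working directory and restore it on exit."""
    old_cwd = os.getcwd()
    try:
        yield
    finally:
        os.chdir(old_cwd)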
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=True))

    if step_type == 'mapper':
        child_args = (['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    child_instance = self._mrjob_cls(args=child_args)

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    # Use custom stdout
    if has_combiner:
        child_stdout = StringIO()
    else:
        child_stdout = open(output_path, 'w')

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = StringIO('\n'.join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    child_instance.parse_counters(self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)
        combiner_stdin.close()
def _run_step_on_spark(self, step, step_num):
    """Set up a fake working directory and environment, and call the
    Spark method."""
    # this is kind of a Spark-specific mash-up of _run_streaming_step()
    # (in sim.py) and _invoke_task_func(), above

    # don't create the output dir for the step; that's Spark's job

    # breaking the Spark step down into tasks is pyspark's job, so
    # we just have a single dummy task
    self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))

    # could potentially parse this for cause of error
    stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
    stdout_path = self._task_output_path('spark', step_num, task_num=0)

    self._create_dist_cache_dir(step_num)

    wd = self._setup_working_dir('spark', step_num, task_num=0)

    # use abspath() on input URIs before changing working dir
    task_args = self._spark_script_args(step_num)

    with open(stdout_path, 'wb') as stdout, \
            open(stderr_path, 'wb') as stderr:
        with save_current_environment(), save_cwd(), save_sys_path(), \
                save_sys_std():
            os.environ.update(_fix_env(self._opts['cmdenv']))
            os.chdir(wd)
            sys.path = [os.getcwd()] + sys.path

            # pretend we redirected stdout and stderr
            sys.stdout, sys.stderr = stdout, stderr

            task = self._mrjob_cls(task_args)
            task.execute()
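# _run_step_on_spark() additionally uses save_sys_path() and save_sys_std(),
# which are not shown in this section. A sketch under the assumption that
# they mirror the environment/cwd helpers above (illustrative only):

import sys
from contextlib import contextmanager


@contextmanager
def save_sys_path():
    """Snapshot sys.path and restore it on exit."""
    old_sys_path = list(sys.path)
    try:
        yield
    finally:
        sys.path = old_sys_path


@contextmanager
def save_sys_std():
    """Snapshot sys.stdin/stdout/stderr and restore them on exit."""
    old_std = (sys.stdin, sys.stdout, sys.stderr)
    try:
        yield
    finally:
        sys.stdin, sys.stdout, sys.stderr = old_std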
def invoke_task(stdin, stdout, stderr, wd, env):
    with save_current_environment(), save_cwd():
        os.environ.update(env)
        os.chdir(wd)

        input_uri = None
        try:
            args = self._args_for_task(step_num, task_type)

            if manifest:
                # read input path from stdin, add to args
                line = stdin.readline().decode('utf_8')
                input_uri = line.split('\t')[-1].rstrip()
                # input_uri is an absolute path, can serve
                # as path and uri both
                args = list(args) + [input_uri, input_uri]

            task = self._mrjob_cls(args)
            task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)

            task.execute()
        except:
            # so users can figure out where the exception came from;
            # see _log_cause_of_error(). we can't wrap the exception
            # because then we lose the stacktrace (which is the whole
            # point of the inline runner)
            if input_uri:  # from manifest
                self._error_while_reading_from = input_uri
            else:
                self._error_while_reading_from = self._task_input_path(
                    task_type, step_num, task_num)

            raise
def test_relative_path_to_uri(self):
    tmp_dir = realpath(gettempdir())

    with save_cwd():
        chdir(tmp_dir)
        foo_uri = to_uri('foo.db')

    self.assertEqual(foo_uri[:8], 'file:///')
    self.assertEqual(foo_uri,
                     'file://' + join(pathname2url(tmp_dir), 'foo.db'))
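# to_uri() itself is not defined in this section. A sketch consistent with
# what the test above asserts (hypothetical; the real helper may also need
# to pass through strings that are already URIs):

import os.path
from urllib.request import pathname2url


def to_uri(path):
    """Convert a local path to a file:// URI."""
    return 'file://' + pathname2url(os.path.abspath(path))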
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=True))

    if step_type == 'mapper':
        child_args = (
            ['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (
            ['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    child_instance = self._mrjob_cls(args=child_args)

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    # Use custom stdout
    if has_combiner:
        child_stdout = StringIO()
    else:
        child_stdout = open(output_path, 'w')

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = StringIO('\n'.join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)
        combiner_stdin.close()
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    # Passing local=False ensures the job uses proper names for file
    # options (see issue #851 on github)
    common_args = (["--step-num=%d" % step_num] +
                   self._mr_job_extra_args(local=False))

    if step_type == "mapper":
        child_args = ["--mapper"] + [input_path] + common_args
    elif step_type == "reducer":
        child_args = ["--reducer"] + [input_path] + common_args
    elif step_type == "combiner":
        child_args = ["--combiner"] + common_args + ["-"]

    has_combiner = step_type == "mapper" and "combiner" in step

    # Use custom stdout
    if has_combiner:
        child_stdout = BytesIO()
    else:
        child_stdout = open(output_path, "wb")

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance = self._mrjob_cls(args=child_args)
            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = BytesIO(b"\n".join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, "combiner", None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)
        combiner_stdin.close()
def invoke_task(stdin, stdout, stderr, wd, env):
    with save_current_environment(), save_cwd():
        os.environ.update(env)
        os.chdir(wd)

        try:
            task = self._mrjob_cls(
                args=self._args_for_task(step_num, task_type))
            task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)
            task.execute()
        except:
            # so users can figure out where the exception came from;
            # see _log_cause_of_error(). we can't wrap the exception
            # because then we lose the stacktrace (which is the whole
            # point of the inline runner)

            # TODO: could write this to a file instead
            self._error_while_reading_from = self._task_input_path(
                task_type, step_num, task_num)

            raise
def invoke_task(stdin, stdout, stderr, wd, env):
    with save_current_environment(), save_cwd(), save_sys_path(), \
            save_sys_std():
        # pretend we're running the script in the working dir
        os.environ.update(env)
        os.chdir(wd)
        sys.path = [os.getcwd()] + sys.path

        # pretend we've redirected stdin/stdout/stderr
        sys.stdin = stdin
        sys.stdout = stdout
        sys.stderr = stderr

        input_uri = None
        try:
            args = self._args_for_task(step_num, task_type)

            if manifest:
                # read input path from stdin, add to args
                line = stdin.readline().decode('utf_8')
                input_uri = line.split('\t')[-1].rstrip()
                # input_uri is an absolute path, can serve
                # as path and uri both
                args = list(args) + [input_uri, input_uri]

            task = self._mrjob_cls(args)
            task.execute()
        except:
            # so users can figure out where the exception came from;
            # see _log_cause_of_error(). we can't wrap the exception
            # because then we lose the stacktrace (which is the whole
            # point of the inline runner)
            if input_uri:  # from manifest
                self._error_while_reading_from = input_uri
            else:
                self._error_while_reading_from = self._task_input_path(
                    task_type, step_num, task_num)

            raise
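# Hypothetical usage of the manifest-aware invoke_task() above: the runner
# opens the task's input, output, and stderr files and passes them in along
# with the task's working dir and environment. Paths and env values here are
# illustrative, not taken from the original code:

wd = '/tmp/job/step-0-mapper/task-0/wd'
env = {'mapreduce_task_partition': '0'}

with open('/tmp/job/step-0-mapper/task-0/input_manifest', 'rb') as stdin, \
        open('/tmp/job/step-0-mapper/task-0/output', 'wb') as stdout, \
        open('/tmp/job/step-0-mapper/task-0/stderr', 'wb') as stderr:
    invoke_task(stdin, stdout, stderr, wd, env)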