def invoke_task(stdin, stdout, stderr, wd, env):
    with save_current_environment(), save_cwd():
        os.environ.update(env)
        os.chdir(wd)

        input_uri = None
        try:
            args = self._args_for_task(step_num, task_type)

            if manifest:
                # read input path from stdin, add to args
                line = stdin.readline().decode('utf_8')
                input_uri = line.split('\t')[-1].rstrip()
                # input_uri is an absolute path, can serve
                # as path and uri both
                args = list(args) + [input_uri, input_uri]

            task = self._mrjob_cls(args)
            task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)

            task.execute()
        except:
            # so users can figure out where the exception came from;
            # see _log_cause_of_error(). we can't wrap the exception
            # because then we lose the stacktrace (which is the whole
            # point of the inline runner)
            if input_uri:  # from manifest
                self._error_while_reading_from = input_uri
            else:
                self._error_while_reading_from = self._task_input_path(
                    task_type, step_num, task_num)

            raise
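A note on the manifest branch above: the line read from stdin is tab-separated, and only the last field (the input URI) is kept. A minimal worked example, using a made-up path purely for illustration:

# hypothetical manifest record; fields are tab-separated, input URI last
line = b'0\t/tmp/mrjob/input/part-00000\n'.decode('utf_8')
input_uri = line.split('\t')[-1].rstrip()
assert input_uri == '/tmp/mrjob/input/part-00000'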
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    # if no mapper, just pass the data through (see #1141)
    if step_type == 'mapper' and not step.get('mapper'):
        copyfile(input_path, output_path)
        return

    # Passing local=False ensures the job uses proper names for file
    # options (see issue #851 on github)
    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=False))

    if step_type == 'mapper':
        child_args = (
            ['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (
            ['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    try:
        # Use custom stdout
        if has_combiner:
            child_stdout = BytesIO()
        else:
            child_stdout = open(output_path, 'wb')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance = self._mrjob_cls(args=child_args)
                child_instance.sandbox(stdin=child_stdin,
                                       stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
        else:
            child_stdout.flush()
    finally:
        child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)

        combiner_stdin.close()
def _run_step_on_spark(self, step, step_num):
    """Set up a fake working directory and environment, and call the
    Spark method."""
    # this is kind of a Spark-specific mash-up of _run_streaming_step()
    # (in sim.py) and _invoke_task_func(), above

    # don't create the output dir for the step; that's Spark's job

    # breaking the Spark step down into tasks is pyspark's job, so
    # we just have a single dummy task
    self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))

    # could potentially parse this for cause of error
    stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
    stdout_path = self._task_output_path('spark', step_num, task_num=0)

    self._create_dist_cache_dir(step_num)

    wd = self._setup_working_dir('spark', step_num, task_num=0)

    # use abspath() on input URIs before changing working dir
    task_args = self._spark_script_args(step_num)

    with open(stdout_path, 'wb') as stdout, \
            open(stderr_path, 'wb') as stderr:
        with save_current_environment(), save_cwd(), save_sys_path(), \
                save_sys_std():
            os.environ.update(_fix_env(self._opts['cmdenv']))
            os.chdir(wd)
            sys.path = [os.getcwd()] + sys.path

            # pretend we redirected stdout and stderr
            sys.stdout, sys.stderr = stdout, stderr

            task = self._mrjob_cls(task_args)
            task.execute()
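The snippets above lean on a family of save_* context managers to scope changes to process-wide state (environment variables, working directory, sys.path, and the standard streams) to a single task. Below is a minimal sketch of the idea, assuming plain contextlib implementations; the real helpers in the surrounding codebase may differ in detail.

import os
import sys
from contextlib import contextmanager


@contextmanager
def save_current_environment():
    # snapshot os.environ and restore it on exit
    old_environ = dict(os.environ)
    try:
        yield
    finally:
        os.environ.clear()
        os.environ.update(old_environ)


@contextmanager
def save_cwd():
    # restore the working directory on exit
    old_cwd = os.getcwd()
    try:
        yield
    finally:
        os.chdir(old_cwd)


@contextmanager
def save_sys_path():
    # restore sys.path on exit
    old_sys_path = list(sys.path)
    try:
        yield
    finally:
        sys.path = old_sys_path


@contextmanager
def save_sys_std():
    # restore sys.stdin/stdout/stderr on exit
    old_std = sys.stdin, sys.stdout, sys.stderr
    try:
        yield
    finally:
        sys.stdin, sys.stdout, sys.stderr = old_std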
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=True))

    if step_type == 'mapper':
        child_args = (['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    child_instance = self._mrjob_cls(args=child_args)

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    # Use custom stdin
    if has_combiner:
        child_stdout = StringIO()
    else:
        child_stdout = open(output_path, 'w')

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = StringIO('\n'.join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    child_instance.parse_counters(self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)

        combiner_stdin.close()
def _run(self):
    self._setup_output_dir()

    assert self._script  # shouldn't be able to run if no script

    for ignored_opt in self.IGNORED_HADOOP_OPTS:
        if ((not self._opts.is_default(ignored_opt)) and
                self._opts[ignored_opt]):
            log.warning('ignoring %s option (requires real Hadoop): %r' %
                        (ignored_opt, self._opts[ignored_opt]))

    for ignored_attr in self.IGNORED_HADOOP_ATTRS:
        value = getattr(self, ignored_attr)
        if value is not None:
            log.warning(
                'ignoring %s keyword arg (requires real Hadoop): %r' %
                (ignored_attr[1:], value))

    for ignored_opt in self.IGNORED_LOCAL_OPTS:
        if ((not self._opts.is_default(ignored_opt)) and
                self._opts[ignored_opt]):
            log.warning('ignoring %s option (use -r local instead): %r' %
                        (ignored_opt, self._opts[ignored_opt]))

    with save_current_environment():
        # set cmdenv variables
        os.environ.update(self._get_cmdenv())

        steps = self._get_steps()
        for step_dict in steps:
            self._check_step_is_mrjob_only(step_dict)

        # run mapper, sort, reducer for each step
        for step_number, step_dict in enumerate(steps):
            self._invoke_inline_mrjob(
                step_number, step_dict,
                'step-%d-mapper' % step_number, 'mapper')

            if 'reducer' in step_dict:
                mapper_output_path = self._prev_outfile
                sorted_mapper_output_path = self._decide_output_path(
                    'step-%d-mapper-sorted' % step_number)
                with open(sorted_mapper_output_path, 'w') as sort_out:
                    proc = subprocess.Popen(
                        ['sort', mapper_output_path],
                        stdout=sort_out, env={'LC_ALL': 'C'})
                    proc.wait()

                # This'll read from sorted_mapper_output_path
                self._invoke_inline_mrjob(
                    step_number, step_dict,
                    'step-%d-reducer' % step_number, 'reducer')

    # move final output to output directory
    self._final_outfile = os.path.join(self._output_dir, 'part-00000')
    log.info('Moving %s -> %s' % (self._prev_outfile, self._final_outfile))
    shutil.move(self._prev_outfile, self._final_outfile)
def _run(self):
    self._setup_output_dir()

    for ignored_opt in self.IGNORED_HADOOP_OPTS:
        if ((not self._opts.is_default(ignored_opt)) and
                self._opts[ignored_opt]):
            log.warning('ignoring %s option (requires real Hadoop): %r' %
                        (ignored_opt, self._opts[ignored_opt]))

    for ignored_attr in self.IGNORED_HADOOP_ATTRS:
        value = getattr(self, ignored_attr)
        if value is not None:
            log.warning(
                'ignoring %s keyword arg (requires real Hadoop): %r' %
                (ignored_attr[1:], value))

    for ignored_opt in self.IGNORED_LOCAL_OPTS:
        if ((not self._opts.is_default(ignored_opt)) and
                self._opts[ignored_opt]):
            log.warning('ignoring %s option (use -r local instead): %r' %
                        (ignored_opt, self._opts[ignored_opt]))

    with save_current_environment():
        # set cmdenv variables
        os.environ.update(self._get_cmdenv())

        steps = self._get_steps()
        for step_dict in steps:
            self._check_step_is_mrjob_only(step_dict)

        # run mapper, sort, reducer for each step
        for step_number, step_dict in enumerate(steps):
            self._invoke_inline_mrjob(
                step_number, step_dict,
                'step-%d-mapper' % step_number, 'mapper')

            if 'reducer' in step_dict:
                mapper_output_path = self._prev_outfile
                sorted_mapper_output_path = self._decide_output_path(
                    'step-%d-mapper-sorted' % step_number)
                with open(sorted_mapper_output_path, 'w') as sort_out:
                    proc = subprocess.Popen(
                        ['sort', mapper_output_path],
                        stdout=sort_out, env={'LC_ALL': 'C'})
                    proc.wait()

                # This'll read from sorted_mapper_output_path
                self._invoke_inline_mrjob(
                    step_number, step_dict,
                    'step-%d-reducer' % step_number, 'reducer')

    # move final output to output directory
    self._final_outfile = os.path.join(self._output_dir, 'part-00000')
    log.info('Moving %s -> %s' % (self._prev_outfile, self._final_outfile))
    shutil.move(self._prev_outfile, self._final_outfile)
def run_step(self, step_dict, input_file, outfile_name, step_number,
             step_type, env, child_stdin=None):
    common_args = (['--step-num=%d' % step_number] +
                   self._mr_job_extra_args(local=True))

    if step_type == 'mapper':
        child_args = (['--mapper'] + [input_file] + common_args)
    elif step_type == 'reducer':
        child_args = (['--reducer'] + [input_file] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    child_instance = self._mrjob_cls(args=child_args)

    has_combiner = (step_type == 'mapper' and 'combiner' in step_dict)

    # Use custom stdin
    if has_combiner:
        child_stdout = StringIO()
    else:
        child_stdout = open(outfile_name, 'w')

    with save_current_environment():
        os.environ.update(env)
        child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
        child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = StringIO('\n'.join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_number:
        self._counters.append({})
    child_instance.parse_counters(self._counters[step_number - 1])

    if has_combiner:
        self.run_step(step_dict, "", outfile_name, step_number,
                      'combiner', env=env, child_stdin=combiner_stdin)
        combiner_stdin.close()
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=True))

    if step_type == 'mapper':
        child_args = (
            ['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (
            ['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    child_instance = self._mrjob_cls(args=child_args)

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    # Use custom stdin
    if has_combiner:
        child_stdout = StringIO()
    else:
        child_stdout = open(output_path, 'w')

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = StringIO('\n'.join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)

        combiner_stdin.close()
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    # Passing local=False ensures the job uses proper names for file
    # options (see issue #851 on github)
    common_args = ["--step-num=%d" % step_num] + self._mr_job_extra_args(local=False)

    if step_type == "mapper":
        child_args = ["--mapper"] + [input_path] + common_args
    elif step_type == "reducer":
        child_args = ["--reducer"] + [input_path] + common_args
    elif step_type == "combiner":
        child_args = ["--combiner"] + common_args + ["-"]

    has_combiner = step_type == "mapper" and "combiner" in step

    # Use custom stdin
    if has_combiner:
        child_stdout = BytesIO()
    else:
        child_stdout = open(output_path, "wb")

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance = self._mrjob_cls(args=child_args)
            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = BytesIO(b"\n".join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(
        child_instance.stderr.getvalue(), counters=self._counters[step_num]
    )

    if has_combiner:
        self._run_step(
            step_num, "combiner", None, output_path, working_dir, env,
            child_stdin=combiner_stdin,
        )
        combiner_stdin.close()
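To make the combiner path above concrete: mapper output is buffered, sorted bytewise, and fed back into the same step as the combiner's stdin. In isolation, with made-up mapper output, the transformation is just:

# hypothetical mapper output captured in a BytesIO buffer
mapper_output = b'b\t2\na\t1\nb\t1\n'
sorted_lines = sorted(mapper_output.splitlines())
combiner_input = b'\n'.join(sorted_lines)
assert combiner_input == b'a\t1\nb\t1\nb\t2'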
def invoke_task(stdin, stdout, stderr, wd, env):
    with save_current_environment(), save_cwd():
        os.environ.update(env)
        os.chdir(wd)

        try:
            task = self._mrjob_cls(
                args=self._args_for_task(step_num, task_type))
            task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)
            task.execute()
        except:
            # so users can figure out where the exception came from;
            # see _log_cause_of_error(). we can't wrap the exception
            # because then we lose the stacktrace (which is the whole
            # point of the inline runner)

            # TODO: could write this to a file instead
            self._error_while_reading_from = self._task_input_path(
                task_type, step_num, task_num)

            raise
def invoke_task(stdin, stdout, stderr, wd, env):
    with save_current_environment(), save_cwd(), save_sys_path(), \
            save_sys_std():
        # pretend we're running the script in the working dir
        os.environ.update(env)
        os.chdir(wd)
        sys.path = [os.getcwd()] + sys.path

        # pretend we've redirected stdin/stdout/stderr
        sys.stdin = stdin
        sys.stdout = stdout
        sys.stderr = stderr

        input_uri = None
        try:
            args = self._args_for_task(step_num, task_type)

            if manifest:
                # read input path from stdin, add to args
                line = stdin.readline().decode('utf_8')
                input_uri = line.split('\t')[-1].rstrip()
                # input_uri is an absolute path, can serve
                # as path and uri both
                args = list(args) + [input_uri, input_uri]

            task = self._mrjob_cls(args)
            task.execute()
        except:
            # so users can figure out where the exception came from;
            # see _log_cause_of_error(). we can't wrap the exception
            # because then we lose the stacktrace (which is the whole
            # point of the inline runner)
            if input_uri:  # from manifest
                self._error_while_reading_from = input_uri
            else:
                self._error_while_reading_from = self._task_input_path(
                    task_type, step_num, task_num)

            raise
def test_explicit_region_beats_environment(self):
    with save_current_environment():
        os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
        runner = DataprocJobRunner(region='europe-west1-a')

        self.assertEqual(runner._opts['region'], 'europe-west1-a')
def test_explicit_zone_beats_environment(self):
    with save_current_environment():
        os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
        runner = DataprocJobRunner(zone='europe-west1-a')

        self.assertEqual(runner._opts['zone'], 'europe-west1-a')
def test_zone_from_environment(self):
    with save_current_environment():
        os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
        runner = DataprocJobRunner()

        self.assertEqual(runner._opts['zone'], 'us-west1-b')
def test_region_from_environment(self):
    with save_current_environment():
        os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
        runner = DataprocJobRunner()

        self.assertEqual(runner._opts['region'], 'us-east1')
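These tests rely on save_current_environment() to keep the CLOUDSDK_* variables from leaking between test cases. A minimal usage sketch, assuming the context manager restores os.environ as outlined earlier:

import os

with save_current_environment():
    os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
    # ... construct a runner and make assertions here ...

# outside the block, the variable is back to whatever it was before
# (typically unset on a development machine)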