def _run_step_on_spark(self, step, step_num):
    """Run a Spark step in-process inside a fake working directory.

    The whole step is treated as one dummy task (task 0). While it runs,
    ``sys.stdout`` and ``sys.stderr`` point at that task's output and
    stderr files, and the environment, cwd, ``sys.path`` and std streams
    are restored afterwards by the ``save_*`` context managers.

    *step* is accepted but not used by this method.
    """
    # This is a Spark-flavored blend of _run_streaming_step() (in sim.py)
    # and _invoke_task_func(). The step's output dir is deliberately NOT
    # created here (that's Spark's job), and splitting the step into
    # real tasks is pyspark's job, hence the single dummy task.
    task_type, task_num = 'spark', 0

    self.fs.mkdir(self._task_dir(task_type, step_num, task_num=task_num))

    # the stderr file could potentially be parsed for the cause of error
    err_path = self._task_stderr_path(
        task_type, step_num, task_num=task_num)
    out_path = self._task_output_path(
        task_type, step_num, task_num=task_num)

    self._create_dist_cache_dir(step_num)
    working_dir = self._setup_working_dir(
        task_type, step_num, task_num=task_num)

    # build the args *before* changing directory, so abspath() on input
    # URIs resolves against the original working dir
    script_args = self._spark_script_args(step_num)

    with open(out_path, 'wb') as out_file, \
            open(err_path, 'wb') as err_file, \
            save_current_environment(), save_cwd(), save_sys_path(), \
            save_sys_std():
        os.environ.update(_fix_env(self._opts['cmdenv']))
        os.chdir(working_dir)
        sys.path = [os.getcwd()] + sys.path

        # pretend we redirected stdout and stderr
        sys.stdout = out_file
        sys.stderr = err_file

        job = self._mrjob_cls(script_args)
        job.execute()
def invoke_task(stdin, stdout, stderr, wd, env):
    """Run one task in this process, imitating a separate process.

    Temporarily updates ``os.environ`` with *env*, changes directory to
    *wd* (also put first on ``sys.path``), and points ``sys.stdin``,
    ``sys.stdout`` and ``sys.stderr`` at the given streams. It then
    builds the job from the task's args and executes it.

    If anything is raised, ``self._error_while_reading_from`` is set to
    the input being processed and the original exception is re-raised
    untouched.
    """
    with save_current_environment(), save_cwd(), save_sys_path(), \
            save_sys_std():
        # make it look like the script was launched from the working dir
        os.environ.update(env)
        os.chdir(wd)
        sys.path = [os.getcwd()] + sys.path

        # stand in for real stdin/stdout/stderr redirection
        sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr

        input_uri = None
        try:
            task_args = self._args_for_task(step_num, task_type)

            if manifest:
                # the input path is the last tab-separated field of the
                # next line on stdin
                manifest_line = stdin.readline().decode('utf_8')
                input_uri = manifest_line.rsplit('\t', 1)[-1].rstrip()
                # input_uri is an absolute path, so it is passed twice:
                # once as the path and once as the URI
                task_args = list(task_args) + [input_uri] * 2

            job = self._mrjob_cls(task_args)
            job.execute()
        except:
            # Deliberately bare, and always re-raised: we only record
            # where the failure happened so users can trace it (see
            # _log_cause_of_error()). Wrapping the exception would lose
            # the stacktrace, which is the whole point of the inline
            # runner.
            #
            # Prefer the URI read from the manifest; otherwise fall back
            # to this task's input path.
            self._error_while_reading_from = (
                input_uri or
                self._task_input_path(task_type, step_num, task_num))
            raise