Example #1
    def _run_step(self, step_num, step_type, input_path, output_path,
                  working_dir, env, child_stdin=None):
        step = self._get_step(step_num)

        # if no mapper, just pass the data through (see #1141)
        if step_type == 'mapper' and not step.get('mapper'):
            copyfile(input_path, output_path)
            return

        # Passing local=False ensures the job uses proper names for file
        # options (see issue #851 on github)
        common_args = (['--step-num=%d' % step_num] +
                       self._mr_job_extra_args(local=False))

        if step_type == 'mapper':
            child_args = (
                ['--mapper'] + [input_path] + common_args)
        elif step_type == 'reducer':
            child_args = (
                ['--reducer'] + [input_path] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        has_combiner = (step_type == 'mapper' and 'combiner' in step)

        # Use a custom stdout: buffer in memory when a combiner follows,
        # otherwise write straight to the output file. Creating it before
        # the try block avoids a NameError in the finally clause if the
        # open() itself fails.
        if has_combiner:
            child_stdout = BytesIO()
        else:
            child_stdout = open(output_path, 'wb')

        try:
            with save_current_environment():
                with save_cwd():
                    os.environ.update(env)
                    os.chdir(working_dir)

                    child_instance = self._mrjob_cls(args=child_args)
                    child_instance.sandbox(stdin=child_stdin,
                                           stdout=child_stdout)
                    child_instance.execute()

            if has_combiner:
                sorted_lines = sorted(child_stdout.getvalue().splitlines())
                combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
            else:
                child_stdout.flush()
        finally:
            child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        parse_mr_job_stderr(child_instance.stderr.getvalue(),
                            counters=self._counters[step_num])

        if has_combiner:
            self._run_step(step_num, 'combiner', None, output_path,
                           working_dir, env, child_stdin=combiner_stdin)

            combiner_stdin.close()
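
All of these examples lean on mrjob's save_cwd() and save_current_environment() context managers to put the working directory and os.environ back the way they were after the child job runs. A minimal sketch of what such helpers could look like (a hypothetical reimplementation, not mrjob's actual source):

    import os
    from contextlib import contextmanager

    @contextmanager
    def save_cwd():
        # remember the current working directory; restore it on exit
        original_cwd = os.getcwd()
        try:
            yield
        finally:
            os.chdir(original_cwd)

    @contextmanager
    def save_current_environment():
        # snapshot os.environ; restore it even if the body raises
        original_environ = dict(os.environ)
        try:
            yield
        finally:
            os.environ.clear()
            os.environ.update(original_environ)
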
Example #2
    def _run_step(self,
                  step_num,
                  step_type,
                  input_path,
                  output_path,
                  working_dir,
                  env,
                  child_stdin=None):
        step = self._get_step(step_num)

        common_args = (['--step-num=%d' % step_num] +
                       self._mr_job_extra_args(local=True))

        if step_type == 'mapper':
            child_args = (['--mapper'] + [input_path] + common_args)
        elif step_type == 'reducer':
            child_args = (['--reducer'] + [input_path] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        child_instance = self._mrjob_cls(args=child_args)

        has_combiner = (step_type == 'mapper' and 'combiner' in step)

        # Use a custom stdout: buffer in memory when a combiner follows
        if has_combiner:
            child_stdout = StringIO()
        else:
            child_stdout = open(output_path, 'w')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = StringIO('\n'.join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        child_instance.parse_counters(self._counters[step_num])

        if has_combiner:
            self._run_step(step_num,
                           'combiner',
                           None,
                           output_path,
                           working_dir,
                           env,
                           child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #3
    def _run_step_on_spark(self, step, step_num):
        """Set up a fake working directory and environment, and call the Spark
        method."""
        # this is kind of a Spark-specific mash-up of _run_streaming_step()
        # (in sim.py) and _invoke_task_func(), above

        # don't create the output dir for the step; that's Spark's job

        # breaking the Spark step down into tasks is pyspark's job, so
        # we just have a single dummy task

        self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))
        # could potentially parse this for cause of error
        stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
        stdout_path = self._task_output_path('spark', step_num, task_num=0)

        self._create_dist_cache_dir(step_num)
        wd = self._setup_working_dir('spark', step_num, task_num=0)

        # use abspath() on input URIs before changing working dir
        task_args = self._spark_script_args(step_num)

        with open(stdout_path, 'wb') as stdout, \
                open(stderr_path, 'wb') as stderr:
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                os.environ.update(_fix_env(self._opts['cmdenv']))
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we redirected stdout and stderr
                sys.stdout, sys.stderr = stdout, stderr

                task = self._mrjob_cls(task_args)
                task.execute()
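
Example #3 also uses save_sys_path() and save_sys_std(); judging from how the code mutates sys.path and reassigns sys.stdout/sys.stderr inside them, they save and restore sys.path and the standard streams. A hedged sketch of equivalent helpers (assumed behavior, not mrjob's actual implementation):

    import sys
    from contextlib import contextmanager

    @contextmanager
    def save_sys_path():
        # put sys.path back after the block prepends the working dir
        original_sys_path = list(sys.path)
        try:
            yield
        finally:
            sys.path = original_sys_path

    @contextmanager
    def save_sys_std():
        # put the standard streams back after the block reassigns them
        original_std = sys.stdin, sys.stdout, sys.stderr
        try:
            yield
        finally:
            sys.stdin, sys.stdout, sys.stderr = original_std
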
Example #4
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd():
                os.environ.update(env)
                os.chdir(wd)

                input_uri = None
                try:
                    args = self._args_for_task(step_num, task_type)

                    if manifest:
                        # read input path from stdin, add to args
                        line = stdin.readline().decode('utf_8')
                        input_uri = line.split('\t')[-1].rstrip()
                        # input_uri is an absolute path, can serve
                        # as path and uri both
                        args = list(args) + [input_uri, input_uri]

                    task = self._mrjob_cls(args)
                    task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)

                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    if input_uri:  # from manifest
                        self._error_while_reading_from = input_uri
                    else:
                        self._error_while_reading_from = self._task_input_path(
                            task_type, step_num, task_num)

                    raise
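
The manifest branch in Example #4 reads a single tab-separated line from stdin and takes the last field as the input URI. A standalone illustration of that parsing, using a made-up sample line:

    from io import BytesIO

    # a manifest line as the task might receive it: tab-separated fields,
    # with the input path in the last one (the path below is invented)
    stdin = BytesIO(b'0\t/tmp/mrjob/step-0000/input_00000.txt\n')

    line = stdin.readline().decode('utf_8')
    input_uri = line.split('\t')[-1].rstrip()

    assert input_uri == '/tmp/mrjob/step-0000/input_00000.txt'
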
Example #5
    def test_relative_path_to_uri(self):
        tmp_dir = realpath(gettempdir())

        with save_cwd():
            chdir(tmp_dir)

            foo_uri = to_uri('foo.db')

            self.assertEqual(foo_uri[:8], 'file:///')
            self.assertEqual(foo_uri,
                             'file://' + join(pathname2url(tmp_dir), 'foo.db'))
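
Judging from the assertions above, to_uri() resolves a relative path against the current working directory and prefixes the result of pathname2url() with the file:// scheme. A rough equivalent, inferred from the test rather than taken from mrjob's source:

    from os.path import abspath
    from urllib.request import pathname2url

    def to_uri_sketch(path):
        # resolve the path against the cwd, then build a file:// URI
        return 'file://' + pathname2url(abspath(path))
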
Example #6
    def _run_step(self, step_num, step_type, input_path, output_path,
                  working_dir, env, child_stdin=None):
        step = self._get_step(step_num)

        common_args = (['--step-num=%d' % step_num] +
                       self._mr_job_extra_args(local=True))

        if step_type == 'mapper':
            child_args = (
                ['--mapper'] + [input_path] + common_args)
        elif step_type == 'reducer':
            child_args = (
                ['--reducer'] + [input_path] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        child_instance = self._mrjob_cls(args=child_args)

        has_combiner = (step_type == 'mapper' and 'combiner' in step)

        # Use a custom stdout: buffer in memory when a combiner follows
        if has_combiner:
            child_stdout = StringIO()
        else:
            child_stdout = open(output_path, 'w')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = StringIO('\n'.join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        parse_mr_job_stderr(child_instance.stderr.getvalue(),
                            counters=self._counters[step_num])

        if has_combiner:
            self._run_step(step_num, 'combiner', None, output_path,
                           working_dir, env, child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #7
    def _run_step(self, step_num, step_type, input_path, output_path, working_dir, env, child_stdin=None):
        step = self._get_step(step_num)

        # Passing local=False ensures the job uses proper names for file
        # options (see issue #851 on github)
        common_args = ["--step-num=%d" % step_num] + self._mr_job_extra_args(local=False)

        if step_type == "mapper":
            child_args = ["--mapper"] + [input_path] + common_args
        elif step_type == "reducer":
            child_args = ["--reducer"] + [input_path] + common_args
        elif step_type == "combiner":
            child_args = ["--combiner"] + common_args + ["-"]

        has_combiner = step_type == "mapper" and "combiner" in step

        # Use a custom stdout: buffer in memory when a combiner follows
        if has_combiner:
            child_stdout = BytesIO()
        else:
            child_stdout = open(output_path, "wb")

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance = self._mrjob_cls(args=child_args)
                child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = BytesIO(b"\n".join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        parse_mr_job_stderr(child_instance.stderr.getvalue(), counters=self._counters[step_num])

        if has_combiner:
            self._run_step(step_num, "combiner", None, output_path, working_dir, env, child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #8
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd():
                os.environ.update(env)
                os.chdir(wd)

                try:
                    task = self._mrjob_cls(
                        args=self._args_for_task(step_num, task_type))
                    task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)

                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    # TODO: could write this to a file instead
                    self._error_while_reading_from = self._task_input_path(
                        task_type, step_num, task_num)
                    raise
Example #9
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                # pretend we're running the script in the working dir
                os.environ.update(env)
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we've redirected stdin/stdout/stderr
                sys.stdin = stdin
                sys.stdout = stdout
                sys.stderr = stderr

                input_uri = None
                try:
                    args = self._args_for_task(step_num, task_type)

                    if manifest:
                        # read input path from stdin, add to args
                        line = stdin.readline().decode('utf_8')
                        input_uri = line.split('\t')[-1].rstrip()
                        # input_uri is an absolute path, can serve
                        # as path and uri both
                        args = list(args) + [input_uri, input_uri]

                    task = self._mrjob_cls(args)
                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    if input_uri:  # from manifest
                        self._error_while_reading_from = input_uri
                    else:
                        self._error_while_reading_from = self._task_input_path(
                            task_type, step_num, task_num)

                    raise