Example #1
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd():
                os.environ.update(env)
                os.chdir(wd)

                input_uri = None
                try:
                    args = self._args_for_task(step_num, task_type)

                    if manifest:
                        # read input path from stdin, add to args
                        line = stdin.readline().decode('utf_8')
                        input_uri = line.split('\t')[-1].rstrip()
                        # input_uri is an absolute path, can serve
                        # as path and uri both
                        args = list(args) + [input_uri, input_uri]

                    task = self._mrjob_cls(args)
                    task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)

                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    if input_uri:  # from manifest
                        self._error_while_reading_from = input_uri
                    else:
                        self._error_while_reading_from = self._task_input_path(
                            task_type, step_num, task_num)

                    raise
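
The manifest branch above assumes each stdin line is tab-separated with the input URI in the last field; since that URI is an absolute local path, it is appended twice (once as path, once as URI). A minimal illustration of the parsing step, using a made-up manifest line:

    # hypothetical manifest line: a leading field, a tab, then the absolute input path
    line = '0\t/tmp/mrjob/step-0-mapper/input/part-00000\n'
    input_uri = line.split('\t')[-1].rstrip()
    # input_uri == '/tmp/mrjob/step-0-mapper/input/part-00000'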
Example #2
    def _run_step(self, step_num, step_type, input_path, output_path,
                  working_dir, env, child_stdin=None):
        step = self._get_step(step_num)

        # if no mapper, just pass the data through (see #1141)
        if step_type == 'mapper' and not step.get('mapper'):
            copyfile(input_path, output_path)
            return

        # Passing local=False ensures the job uses proper names for file
        # options (see issue #851 on github)
        common_args = (['--step-num=%d' % step_num] +
                       self._mr_job_extra_args(local=False))

        if step_type == 'mapper':
            child_args = (
                ['--mapper'] + [input_path] + common_args)
        elif step_type == 'reducer':
            child_args = (
                ['--reducer'] + [input_path] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        has_combiner = (step_type == 'mapper' and 'combiner' in step)

        try:
            # Use custom stdout
            if has_combiner:
                child_stdout = BytesIO()
            else:
                child_stdout = open(output_path, 'wb')

            with save_current_environment():
                with save_cwd():
                    os.environ.update(env)
                    os.chdir(working_dir)

                    child_instance = self._mrjob_cls(args=child_args)
                    child_instance.sandbox(stdin=child_stdin,
                                           stdout=child_stdout)
                    child_instance.execute()

            if has_combiner:
                sorted_lines = sorted(child_stdout.getvalue().splitlines())
                combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
            else:
                child_stdout.flush()
        finally:
            child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        parse_mr_job_stderr(child_instance.stderr.getvalue(),
                            counters=self._counters[step_num])

        if has_combiner:
            self._run_step(step_num, 'combiner', None, output_path,
                           working_dir, env, child_stdin=combiner_stdin)

            combiner_stdin.close()
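
When the step has a combiner, the code above emulates Hadoop's shuffle in miniature: the mapper's output is captured in a BytesIO, sorted byte-wise line by line, and replayed as the combiner's stdin. A minimal sketch of that round trip with illustrative data:

    from io import BytesIO

    # captured mapper output (key<TAB>value lines, unsorted)
    mapper_stdout = BytesIO()
    mapper_stdout.write(b'banana\t1\napple\t1\napple\t1\n')

    # a byte-wise sort brings equal keys together, as Hadoop's shuffle would
    sorted_lines = sorted(mapper_stdout.getvalue().splitlines())
    combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
    # combiner_stdin.read() == b'apple\t1\napple\t1\nbanana\t1'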
Example #3
    def _run_step_on_spark(self, step, step_num):
        """Set up a fake working directory and environment, and call the Spark
        method."""
        # this is kind of a Spark-specific mash-up of _run_streaming_step()
        # (in sim.py) and _invoke_task_func(), above

        # don't create the output dir for the step; that's Spark's job

        # breaking the Spark step down into tasks is pyspark's job, so
        # we just have a single dummy task

        self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))
        # could potentially parse this for cause of error
        stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
        stdout_path = self._task_output_path('spark', step_num, task_num=0)

        self._create_dist_cache_dir(step_num)
        wd = self._setup_working_dir('spark', step_num, task_num=0)

        # use abspath() on input URIs before changing working dir
        task_args = self._spark_script_args(step_num)

        with open(stdout_path, 'wb') as stdout, \
                open(stderr_path, 'wb') as stderr:
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                os.environ.update(_fix_env(self._opts['cmdenv']))
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we redirected stdout and stderr
                sys.stdout, sys.stderr = stdout, stderr

                task = self._mrjob_cls(task_args)
                task.execute()
Example #4
    def _run_step(self,
                  step_num,
                  step_type,
                  input_path,
                  output_path,
                  working_dir,
                  env,
                  child_stdin=None):
        step = self._get_step(step_num)

        common_args = (['--step-num=%d' % step_num] +
                       self._mr_job_extra_args(local=True))

        if step_type == 'mapper':
            child_args = (['--mapper'] + [input_path] + common_args)
        elif step_type == 'reducer':
            child_args = (['--reducer'] + [input_path] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        child_instance = self._mrjob_cls(args=child_args)

        has_combiner = (step_type == 'mapper' and 'combiner' in step)

        # Use custom stdout (kept in memory when a combiner will consume it)
        if has_combiner:
            child_stdout = StringIO()
        else:
            child_stdout = open(output_path, 'w')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = StringIO('\n'.join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        child_instance.parse_counters(self._counters[step_num])

        if has_combiner:
            self._run_step(step_num,
                           'combiner',
                           None,
                           output_path,
                           working_dir,
                           env,
                           child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #5
    def _run(self):
        self._setup_output_dir()

        assert self._script  # shouldn't be able to run if no script

        for ignored_opt in self.IGNORED_HADOOP_OPTS:
            if ((not self._opts.is_default(ignored_opt)) and
                self._opts[ignored_opt]):
                log.warning('ignoring %s option (requires real Hadoop): %r' %
                            (ignored_opt, self._opts[ignored_opt]))

        for ignored_attr in self.IGNORED_HADOOP_ATTRS:
            value = getattr(self, ignored_attr)
            if value is not None:
                log.warning(
                    'ignoring %s keyword arg (requires real Hadoop): %r' %
                    (ignored_attr[1:], value))

        for ignored_opt in self.IGNORED_LOCAL_OPTS:
            if ((not self._opts.is_default(ignored_opt)) and
                self._opts[ignored_opt]):
                log.warning('ignoring %s option (use -r local instead): %r' %
                            (ignored_opt, self._opts[ignored_opt]))

        with save_current_environment():
            # set cmdenv variables
            os.environ.update(self._get_cmdenv())

            steps = self._get_steps()

            for step_dict in steps:
                self._check_step_is_mrjob_only(step_dict)

            # run mapper, sort, reducer for each step
            for step_number, step_dict in enumerate(steps):

                self._invoke_inline_mrjob(
                    step_number, step_dict, 'step-%d-mapper' % step_number,
                    'mapper')

                if 'reducer' in step_dict:
                    mapper_output_path = self._prev_outfile
                    sorted_mapper_output_path = self._decide_output_path(
                        'step-%d-mapper-sorted' % step_number)
                    with open(sorted_mapper_output_path, 'w') as sort_out:
                        proc = subprocess.Popen(
                            ['sort', mapper_output_path],
                            stdout=sort_out, env={'LC_ALL': 'C'})
                    proc.wait()

                    # This'll read from sorted_mapper_output_path
                    self._invoke_inline_mrjob(
                        step_number, step_dict,
                        'step-%d-reducer' % step_number, 'reducer')

        # move final output to output directory
        self._final_outfile = os.path.join(self._output_dir, 'part-00000')
        log.info('Moving %s -> %s' % (self._prev_outfile, self._final_outfile))
        shutil.move(self._prev_outfile, self._final_outfile)
Example #6
    def _run(self):
        self._setup_output_dir()

        for ignored_opt in self.IGNORED_HADOOP_OPTS:
            if ((not self._opts.is_default(ignored_opt))
                    and self._opts[ignored_opt]):
                log.warning('ignoring %s option (requires real Hadoop): %r' %
                            (ignored_opt, self._opts[ignored_opt]))

        for ignored_attr in self.IGNORED_HADOOP_ATTRS:
            value = getattr(self, ignored_attr)
            if value is not None:
                log.warning(
                    'ignoring %s keyword arg (requires real Hadoop): %r' %
                    (ignored_attr[1:], value))

        for ignored_opt in self.IGNORED_LOCAL_OPTS:
            if ((not self._opts.is_default(ignored_opt))
                    and self._opts[ignored_opt]):
                log.warning('ignoring %s option (use -r local instead): %r' %
                            (ignored_opt, self._opts[ignored_opt]))

        with save_current_environment():
            # set cmdenv variables
            os.environ.update(self._get_cmdenv())

            steps = self._get_steps()

            for step_dict in steps:
                self._check_step_is_mrjob_only(step_dict)

            # run mapper, sort, reducer for each step
            for step_number, step_dict in enumerate(steps):

                self._invoke_inline_mrjob(step_number, step_dict,
                                          'step-%d-mapper' % step_number,
                                          'mapper')

                if 'reducer' in step_dict:
                    mapper_output_path = self._prev_outfile
                    sorted_mapper_output_path = self._decide_output_path(
                        'step-%d-mapper-sorted' % step_number)
                    with open(sorted_mapper_output_path, 'w') as sort_out:
                        proc = subprocess.Popen(['sort', mapper_output_path],
                                                stdout=sort_out,
                                                env={'LC_ALL': 'C'})
                    proc.wait()

                    # This'll read from sorted_mapper_output_path
                    self._invoke_inline_mrjob(step_number, step_dict,
                                              'step-%d-reducer' % step_number,
                                              'reducer')

        # move final output to output directory
        self._final_outfile = os.path.join(self._output_dir, 'part-00000')
        log.info('Moving %s -> %s' % (self._prev_outfile, self._final_outfile))
        shutil.move(self._prev_outfile, self._final_outfile)
Example #7
    def run_step(self,
                 step_dict,
                 input_file,
                 outfile_name,
                 step_number,
                 step_type,
                 env,
                 child_stdin=None):
        common_args = (['--step-num=%d' % step_number] +
                       self._mr_job_extra_args(local=True))

        if step_type == 'mapper':
            child_args = (['--mapper'] + [input_file] + common_args)
        elif step_type == 'reducer':
            child_args = (['--reducer'] + [input_file] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        child_instance = self._mrjob_cls(args=child_args)

        has_combiner = (step_type == 'mapper' and 'combiner' in step_dict)

        # Use custom stdout (kept in memory when a combiner will consume it)
        if has_combiner:
            child_stdout = StringIO()
        else:
            child_stdout = open(outfile_name, 'w')

        with save_current_environment():
            os.environ.update(env)
            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = StringIO('\n'.join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_number:
            self._counters.append({})
        child_instance.parse_counters(self._counters[step_number - 1])

        if has_combiner:
            self.run_step(step_dict,
                          "",
                          outfile_name,
                          step_number,
                          'combiner',
                          env=env,
                          child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #8
    def _run_step(self, step_num, step_type, input_path, output_path,
                  working_dir, env, child_stdin=None):
        step = self._get_step(step_num)

        common_args = (['--step-num=%d' % step_num] +
                       self._mr_job_extra_args(local=True))

        if step_type == 'mapper':
            child_args = (
                ['--mapper'] + [input_path] + common_args)
        elif step_type == 'reducer':
            child_args = (
                ['--reducer'] + [input_path] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        child_instance = self._mrjob_cls(args=child_args)

        has_combiner = (step_type == 'mapper' and 'combiner' in step)

        # Use custom stdout (kept in memory when a combiner will consume it)
        if has_combiner:
            child_stdout = StringIO()
        else:
            child_stdout = open(output_path, 'w')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = StringIO('\n'.join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        parse_mr_job_stderr(child_instance.stderr.getvalue(),
                            counters=self._counters[step_num])

        if has_combiner:
            self._run_step(step_num, 'combiner', None, output_path,
                           working_dir, env, child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #9
    def run_step(self, step_dict, input_file, outfile_name,
                 step_number, step_type, env,
                 child_stdin=None):
        common_args = (['--step-num=%d' % step_number] +
                       self._mr_job_extra_args(local=True))

        if step_type == 'mapper':
            child_args = (
                ['--mapper'] + [input_file] + common_args)
        elif step_type == 'reducer':
            child_args = (
                ['--reducer'] + [input_file] + common_args)
        elif step_type == 'combiner':
            child_args = ['--combiner'] + common_args + ['-']

        child_instance = self._mrjob_cls(args=child_args)

        has_combiner = (step_type == 'mapper' and 'combiner' in step_dict)

        # Use custom stdout (kept in memory when a combiner will consume it)
        if has_combiner:
            child_stdout = StringIO()
        else:
            child_stdout = open(outfile_name, 'w')

        with save_current_environment():
            os.environ.update(env)
            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = StringIO('\n'.join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_number:
            self._counters.append({})
        child_instance.parse_counters(self._counters[step_number - 1])

        if has_combiner:
            self.run_step(step_dict, "", outfile_name, step_number, 'combiner',
                          env=env, child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #10
    def _run_step(self, step_num, step_type, input_path, output_path, working_dir, env, child_stdin=None):
        step = self._get_step(step_num)

        # Passing local=False ensures the job uses proper names for file
        # options (see issue #851 on github)
        common_args = ["--step-num=%d" % step_num] + self._mr_job_extra_args(local=False)

        if step_type == "mapper":
            child_args = ["--mapper"] + [input_path] + common_args
        elif step_type == "reducer":
            child_args = ["--reducer"] + [input_path] + common_args
        elif step_type == "combiner":
            child_args = ["--combiner"] + common_args + ["-"]

        has_combiner = step_type == "mapper" and "combiner" in step

        # Use custom stdout (kept in memory when a combiner will consume it)
        if has_combiner:
            child_stdout = BytesIO()
        else:
            child_stdout = open(output_path, "wb")

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance = self._mrjob_cls(args=child_args)
                child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = BytesIO(b"\n".join(sorted_lines))
        else:
            child_stdout.flush()

        child_stdout.close()

        while len(self._counters) <= step_num:
            self._counters.append({})
        parse_mr_job_stderr(child_instance.stderr.getvalue(), counters=self._counters[step_num])

        if has_combiner:
            self._run_step(step_num, "combiner", None, output_path, working_dir, env, child_stdin=combiner_stdin)

            combiner_stdin.close()
Example #11
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd():
                os.environ.update(env)
                os.chdir(wd)

                try:
                    task = self._mrjob_cls(
                        args=self._args_for_task(step_num, task_type))
                    task.sandbox(stdin=stdin, stdout=stdout, stderr=stderr)

                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    # TODO: could write this to a file instead
                    self._error_while_reading_from = self._task_input_path(
                        task_type, step_num, task_num)
                    raise
Example #12
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                # pretend we're running the script in the working dir
                os.environ.update(env)
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we've redirected stdin/stdout/stderr
                sys.stdin = stdin
                sys.stdout = stdout
                sys.stderr = stderr

                input_uri = None
                try:
                    args = self._args_for_task(step_num, task_type)

                    if manifest:
                        # read input path from stdin, add to args
                        line = stdin.readline().decode('utf_8')
                        input_uri = line.split('\t')[-1].rstrip()
                        # input_uri is an absolute path, can serve
                        # as path and uri both
                        args = list(args) + [input_uri, input_uri]

                    task = self._mrjob_cls(args)
                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    if input_uri:  # from manifest
                        self._error_while_reading_from = input_uri
                    else:
                        self._error_while_reading_from = self._task_input_path(
                            task_type, step_num, task_num)

                    raise
Example #13
    def test_explicit_region_beats_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
            runner = DataprocJobRunner(region='europe-west1-a')

        self.assertEqual(runner._opts['region'], 'europe-west1-a')
Example #14
    def test_explicit_zone_beats_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
            runner = DataprocJobRunner(zone='europe-west1-a')

        self.assertEqual(runner._opts['zone'], 'europe-west1-a')
Example #15
    def test_zone_from_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
            runner = DataprocJobRunner()

        self.assertEqual(runner._opts['zone'], 'us-west1-b')
Example #16
    def test_region_from_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
            runner = DataprocJobRunner()

        self.assertEqual(runner._opts['region'], 'us-east1')
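
Every example above leans on save_current_environment() (and usually save_cwd(), plus save_sys_path() and save_sys_std() in the Spark cases) so that changes to os.environ, the working directory, sys.path, and the standard streams do not leak outside the block. A minimal sketch of the first two, assuming they simply snapshot state on entry and restore it on exit (the real mrjob helpers may differ in detail):

    import os
    from contextlib import contextmanager

    @contextmanager
    def save_current_environment():
        """Snapshot os.environ and restore it on exit (sketch, not mrjob's code)."""
        saved_environ = dict(os.environ)
        try:
            yield
        finally:
            os.environ.clear()
            os.environ.update(saved_environ)

    @contextmanager
    def save_cwd():
        """Snapshot the working directory and restore it on exit (sketch)."""
        saved_cwd = os.getcwd()
        try:
            yield
        finally:
            os.chdir(saved_cwd)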