Example #1
File: inline.py  Project: Yelp/mrjob
    def _run_step_on_spark(self, step, step_num):
        """Set up a fake working directory and environment, and call the Spark
        method."""
        # this is kind of a Spark-specific mash-up of _run_streaming_step()
        # (in sim.py) and _invoke_task_func(), above

        # don't create the output dir for the step; that's Spark's job

        # breaking the Spark step down into tasks is pyspark's job, so
        # we just have a single dummy task

        self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))
        # could potentially parse this for cause of error
        stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
        stdout_path = self._task_output_path('spark', step_num, task_num=0)

        self._create_dist_cache_dir(step_num)
        wd = self._setup_working_dir('spark', step_num, task_num=0)

        # use abspath() on input URIs before changing working dir
        task_args = self._spark_script_args(step_num)

        with open(stdout_path, 'wb') as stdout, \
                open(stderr_path, 'wb') as stderr:
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                os.environ.update(_fix_env(self._opts['cmdenv']))
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we redirected stdout and stderr
                sys.stdout, sys.stderr = stdout, stderr

                task = self._mrjob_cls(task_args)
                task.execute()
Example #2
    def test_basic(self):
        fake_stdin = BytesIO(b'HI')
        fake_stdout = BytesIO()
        fake_stderr = BytesIO()

        with save_sys_std():
            sys.stdin = fake_stdin
            self.assertEqual(sys.stdin.read(), b'HI')

            sys.stdout = fake_stdout
            sys.stdout.write(b'Hello!\n')

            sys.stderr = fake_stderr
            sys.stderr.write(b'!!!')

        self.assertEqual(sys.stdin, self.stdin)
        self.assertEqual(sys.stdout, self.stdout)
        self.assertEqual(sys.stderr, self.stderr)

        self.assertFalse(self.stdin.read.called)
        self.assertFalse(self.stdout.write.called)
        self.assertFalse(self.stderr.write.called)

        self.assertEqual(fake_stdout.getvalue(), b'Hello!\n')
        self.assertEqual(fake_stderr.getvalue(), b'!!!')
Example #3
    def test_bad_flush(self):
        fake_stdout = "LOOK AT ME I'M STDOUT"
        self.assertFalse(hasattr(fake_stdout, 'flush'))

        with save_sys_std():
            sys.stdout = fake_stdout

        self.assertEqual(sys.stdout, self.stdout)
        self.assertEqual(self.stdout.flush.call_count, 1)

        # sys.stderr, which was not patched, should be flushed twice
        self.assertEqual(self.stderr.flush.call_count, 2)
Example #4
    def test_flushing(self):
        fake_stderr = Mock()

        with save_sys_std():
            sys.stderr = fake_stderr
            sys.stderr.write(b'Hello!\n')

        self.assertEqual(self.stderr.flush.call_count, 1)
        self.assertEqual(fake_stderr.flush.call_count, 1)

        # stdout was never patched, so it gets flushed twice
        self.assertEqual(self.stdout.flush.call_count, 2)

        # we don't flush stdin
        self.assertFalse(self.stdin.flush.called)
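Read together, the tests above pin down the observable contract of save_sys_std(): on exit the original sys.stdin/stdout/stderr are restored, the output streams are flushed on the way in and again on the way out, objects without a flush() method are skipped, and stdin is never flushed. A minimal sketch of a context manager with that contract, inferred only from these tests (the name save_sys_std_sketch and helper below are made up; this is not mrjob's actual implementation):

import sys
from contextlib import contextmanager


def _flush_if_possible(stream):
    # skip objects with no flush() method (see test_bad_flush above)
    if hasattr(stream, 'flush'):
        stream.flush()


@contextmanager
def save_sys_std_sketch():
    """Save sys.stdin/stdout/stderr and restore them on exit, flushing
    the output streams on the way in and on the way out (never stdin)."""
    stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
    try:
        _flush_if_possible(sys.stdout)
        _flush_if_possible(sys.stderr)
        yield
    finally:
        # flush whatever is installed now (possibly a patched stream),
        # then put the originals back
        _flush_if_possible(sys.stdout)
        _flush_if_possible(sys.stderr)
        sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr

Used as "with save_sys_std_sketch(): sys.stdout = my_buffer", the original streams come back automatically even if the body raises, which is what the runner code in the other examples relies on.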
Example #5
File: inline.py  Project: Yelp/mrjob
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                # pretend we're running the script in the working dir
                os.environ.update(env)
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we've redirected stdin/stdout/stderr
                sys.stdin = stdin
                sys.stdout = stdout
                sys.stderr = stderr

                input_uri = None
                try:
                    args = self._args_for_task(step_num, task_type)

                    if manifest:
                        # read input path from stdin, add to args
                        line = stdin.readline().decode('utf_8')
                        input_uri = line.split('\t')[-1].rstrip()
                        # input_uri is an absolute path, can serve
                        # as path and uri both
                        args = list(args) + [input_uri, input_uri]

                    task = self._mrjob_cls(args)
                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    if input_uri:  # from manifest
                        self._error_while_reading_from = input_uri
                    else:
                        self._error_while_reading_from = self._task_input_path(
                            task_type, step_num, task_num)

                    raise
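The manifest branch of invoke_task() assumes each stdin line carries the input URI as its last tab-separated field; splitting on '\t' and stripping the trailing newline leaves a bare absolute path that can serve as both the path and the URI argument. A tiny stand-alone illustration (the sample line below is invented for demonstration, not mrjob's actual manifest format):

# hypothetical manifest record: "<leading field>\t<absolute input path>\n"
line = b'0\t/tmp/my_job/step-0000-input/part-00000\n'.decode('utf_8')

input_uri = line.split('\t')[-1].rstrip()
print(input_uri)  # -> /tmp/my_job/step-0000-input/part-00000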