Example #1
    def _assert_output_matches(
            self, job_class, input_bytes=b'', input_paths=(), job_args=[]):

        # run classes defined in this module in inline mode, classes
        # with their own script files in local mode. used by
        # test_skip_combiner_that_runs_cmd()
        if job_class.__module__ == __name__:
            runner_alias = 'inline'
        else:
            runner_alias = 'local'

        reference_job = self._reference_job(
            job_class, input_bytes=input_bytes,
            input_paths=input_paths,
            job_args=job_args,
            runner_alias=runner_alias)

        with reference_job.make_runner() as runner:
            runner.run()

            reference_output = sorted(to_lines(runner.cat_output()))

        harness_job = self._harness_job(
            job_class, input_bytes=input_bytes,
            input_paths=input_paths,
            job_args=job_args)

        with harness_job.make_runner() as runner:
            runner.run()

            harness_output = sorted(to_lines(runner.cat_output()))

        self.assertEqual(harness_output, reference_output)
Example #2
 def test_no_trailing_newline(self):
     self.assertEqual(
         list(to_lines(iter([
             b'Alouette,\ngentille',
             b' Alouette.',
         ]))),
         [b'Alouette,\n', b'gentille Alouette.'])
Example #3
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                               '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with mr_job.make_runner() as runner:
            runner.run()

            # expect python -v crud in stderr

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any(
                    'import mrjob' in line or  # Python 2
                    "import 'mrjob'" in line
                    for line in lines))

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any(
                    '#' in line for line in lines))

            # should still get expected results
            self.assertEqual(
                sorted(to_lines(runner.cat_output())),
                sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #4
    def test_loading_bootstrapped_mrjob_library(self):
        # track the dir we're loading mrjob from rather than the full path
        # to deal with edge cases where we load from the .py file,
        # and the script loads from the .pyc compiled from that .py file.
        our_mrjob_dir = os.path.dirname(os.path.realpath(mrjob.__file__))

        with mrjob_conf_patcher():
            mr_job = MRJobWhereAreYou(['-r', 'local', '--bootstrap-mrjob'])
            mr_job.sandbox()

            with mr_job.make_runner() as runner:
                # sanity check
                self.assertEqual(runner._bootstrap_mrjob(), True)
                local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())

                runner.run()

                output = list(to_lines(runner.cat_output()))
                self.assertEqual(len(output), 1)

                # script should load mrjob from its working dir
                _, script_mrjob_dir = mr_job.parse_output_line(output[0])

                self.assertNotEqual(our_mrjob_dir, script_mrjob_dir)
                self.assertTrue(script_mrjob_dir.startswith(local_tmp_dir))
Example #5
    def test_mixed_job(self):
        # test a combination of streaming and spark steps
        job = MRStreamingAndSpark(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(
            b'foo\nbar\n'))

        with job.make_runner() as runner:
            runner.run()

            # converts to 'null\t"foo"', 'null\t"bar"' and then counts chars
            self.assertEqual(
                sorted(to_lines(runner.cat_output())),
                [
                    b'\t 2\n',
                    b'" 4\n',
                    b'a 1\n',
                    b'b 1\n',
                    b'f 1\n',
                    b'l 4\n',
                    b'n 2\n',
                    b'o 2\n',
                    b'r 1\n',
                    b'u 2\n',
                ]
            )
Example #6
    def test_cat_output(self):
        a_dir_path = os.path.join(self.tmp_dir, 'a')
        b_dir_path = os.path.join(self.tmp_dir, 'b')
        l_dir_path = os.path.join(self.tmp_dir, '_logs')
        os.mkdir(a_dir_path)
        os.mkdir(b_dir_path)
        os.mkdir(l_dir_path)

        a_file_path = os.path.join(a_dir_path, 'part-00000')
        b_file_path = os.path.join(b_dir_path, 'part-00001')
        c_file_path = os.path.join(self.tmp_dir, 'part-00002')
        x_file_path = os.path.join(l_dir_path, 'log.xml')
        y_file_path = os.path.join(self.tmp_dir, '_SUCCESS')

        with open(a_file_path, 'w') as f:
            f.write('A')

        with open(b_file_path, 'w') as f:
            f.write('B')

        with open(c_file_path, 'w') as f:
            f.write('C')

        with open(x_file_path, 'w') as f:
            f.write('<XML XML XML/>')

        with open(y_file_path, 'w') as f:
            f.write('I win')

        runner = InlineMRJobRunner(conf_paths=[], output_dir=self.tmp_dir)
        self.assertEqual(sorted(to_lines(runner.cat_output())),
                         [b'A', b'B', b'C'])
Example #7
 def test_multiple_2(self):
     data = b'x\ny\nz\n'
     job = MRCmdJob(['--mapper-cmd=cat', '--reducer-cmd-2', 'wc -l',
                     '--runner=local', '--no-conf'])
     job.sandbox(stdin=BytesIO(data))
     with job.make_runner() as r:
         r.run()
         self.assertEqual(sum(int(l) for l in to_lines(r.cat_output())), 3)
Example #8
File: job.py Project: Yelp/mrjob
    def parse_output(self, chunks):
        """Parse the final output of this MRJob (as a stream of byte chunks)
        into a stream of ``(key, value)``.
        """
        read = self.output_protocol().read

        for line in to_lines(chunks):
            yield read(line)
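The docstring above states the contract; here is a hedged usage sketch. The chunk boundaries are illustrative, and a bare MRJob (whose default output protocol reads tab-separated JSON) stands in for a real job class:

from mrjob.job import MRJob

job = MRJob([])

# chunks may split a line anywhere; to_lines() inside parse_output()
# reassembles b'null\t"bar"\n' before the output protocol decodes it
chunks = iter([b'null\t"foo"\n', b'null\t"b', b'ar"\n'])

for key, value in job.parse_output(chunks):
    print(key, value)  # None foo, then None bar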
Example #9
 def test_buffered_lines(self):
     self.assertEqual(
         list(to_lines(chunk for chunk in
                       [b'The quick\nbrown fox\nju',
                        b'mped over\nthe lazy\ndog',
                        b's.\n'])),
         [b'The quick\n', b'brown fox\n', b'jumped over\n', b'the lazy\n',
          b'dogs.\n'])
Example #10
 def test_long_lines(self):
     super_long_line = b'a' * 10000 + b'\n' + b'b' * 1000 + b'\nlast\n'
     self.assertEqual(
         list(to_lines(
             chunk for chunk in
             (super_long_line[0 + i:1024 + i]
              for i in range(0, len(super_long_line), 1024)))),
         [b'a' * 10000 + b'\n', b'b' * 1000 + b'\n', b'last\n'])
Example #11
    def test_read_all_non_hidden_files(self):
        self.makefile(os.path.join(self.output_dir, 'baz'),
                      b'qux\n')

        self.makefile(os.path.join(self.output_dir, 'foo', 'bar'),
                      b'baz\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'baz\n', b'qux\n'])
Example #12
    def test_output_dir_not_considered_hidden(self):
        output_dir = os.path.join(self.tmp_dir, '_hidden', '_output_dir')

        self.makefile(os.path.join(output_dir, 'part-00000'),
                      b'cats\n')

        runner = InlineMRJobRunner(conf_paths=[], output_dir=output_dir)

        self.assertEqual(sorted(to_lines(runner.stream_output())),
                         [b'cats\n'])
Example #13
    def stream_output(self):
        """Like :py:meth:`cat_output` except that it groups bytes into
        lines. Equivalent to ``mrjob.util.to_lines(runner.stream_output())``.

        .. deprecated:: 0.6.0
        """
        log.warning('stream_output() is deprecated and will be removed in'
                    ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())'
                    ' instead.')

        return to_lines(self.cat_output())
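Updating a deprecated call site is mechanical. A minimal sketch, assuming runner is any live runner obtained from make_runner():

from mrjob.util import to_lines

# before (deprecated since v0.6.0, slated for removal in v0.7.0):
#     for line in runner.stream_output():
#         ...
# after:
for line in to_lines(runner.cat_output()):
    ...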
Example #14
 def test_eof_without_trailing_newline(self):
     self.assertEqual(
         list(to_lines(iter([
             b'Alouette,\ngentille',
             b' Alouette.',
             b'',  # treated as EOF
             b'Allouette,\nje te p',
             b'lumerais.',
         ]))),
         [b'Alouette,\n', b'gentille Alouette.',
          b'Allouette,\n', b'je te plumerais.'])
Example #15
File: wrap.py Project: Affirm/mrjob
def _cat_log_lines(fs, path):
    """Yield lines from the given log.

    Log errors rather than raising them.
    """
    try:
        if not fs.exists(path):
            return
        for line in to_lines(fs.cat(path)):
            yield to_unicode(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
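A hedged call-site sketch: fs stands for any mrjob filesystem object exposing exists() and cat() (e.g. runner.fs), and the log path is illustrative:

for line in _cat_log_lines(fs, 'hdfs:///tmp/logs/syslog'):
    # lines arrive as unicode; read errors were logged, not raised
    if 'ERROR' in line:
        print(line)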
Example #16
    def test_no_file_args_required(self):
        words1 = self.makefile('words1', b'kit and caboodle\n')
        words2 = self.makefile('words2', b'baubles\nbangles and beads\n')

        job = MRJobLauncher(
            args=['-r', 'local', tests.sr_wc.__file__, words1, words2])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

            lines = list(to_lines(runner.cat_output()))
            self.assertEqual(len(lines), 1)
            self.assertEqual(int(lines[0]), 7)
Example #17
    def test_mixed_job(self):
        # can we run just the streaming part of a job?
        input_bytes = b'foo\nbar\n'

        job = self._harness_job(
            MRStreamingAndSpark, input_bytes=input_bytes,
            first_step_num=0, last_step_num=0)

        with job.make_runner() as runner:
            runner.run()

            # the streaming part is just an identity mapper, but it converts
            # lines to pairs of JSON
            self.assertEqual(set(to_lines(runner.cat_output())),
                             {b'null\t"foo"\n', b'null\t"bar"\n'})
Example #18
    def test_output_in_subdirs(self):
        # test for output being placed in subdirs, for example with nicknack
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00000'),
                      b'line-a0\n')
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00001'),
                      b'line-a1\n')

        self.makefile(os.path.join(self.output_dir, 'b', 'part-00000'),
                      b'line-b0\n')

        self.makefile(os.path.join(self.output_dir, 'b', '.crc.part-00000'),
                      b'42\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line-a0\n', b'line-a1\n', b'line-b0\n'])
Example #19
    def test_mixed_job(self):
        # can we run just the streaming part of a job?
        input_bytes = b'foo\nbar\n'

        job = self._harness_job(MRStreamingAndSpark,
                                input_bytes=input_bytes,
                                start_step=0,
                                end_step=1)

        with job.make_runner() as runner:
            runner.run()

            # the streaming part is just an identity mapper, but it converts
            # lines to pairs of JSON
            self.assertEqual(set(to_lines(runner.cat_output())),
                             {b'null\t"foo"\n', b'null\t"bar"\n'})
Example #20
    def test_cat_mapper(self):
        data = b'x\ny\nz\n'
        job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(r._get_steps(), [{
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat'
                }
            }])

            r.run()
            lines = [line.strip() for line in to_lines(r.cat_output())]
            self.assertEqual(sorted(lines), sorted(data.split()))
Example #21
    def test_spark_script_mrjob(self):
        text = b'one fish\ntwo fish\nred fish\nblue fish\n'

        job = MRSparkScriptWordcount(['-r', 'local'])
        job.sandbox(stdin=BytesIO(text))

        counts = {}

        with job.make_runner() as runner:
            runner.run()

            for line in to_lines(runner.cat_output()):
                k, v = safeeval(line)
                counts[k] = v

        self.assertEqual(counts, dict(blue=1, fish=4, one=1, red=1, two=1))
Example #22
    def test_cat_mapper(self):
        data = b'x\ny\nz\n'
        job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(
                r._get_steps(),
                [{
                    'type': 'streaming',
                    'mapper': {
                        'type': 'command',
                        'command': 'cat'}}])

            r.run()
            lines = [line.strip() for line in to_lines(r.cat_output())]
            self.assertEqual(sorted(lines), sorted(data.split()))
Example #23
    def test_mapper_pre_filter(self):
        data = b'x\ny\nz\n'
        job = MRFilterJob(['--mapper-filter', 'cat -e', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(
                r._get_steps(),
                [{
                    'type': 'streaming',
                    'mapper': {
                        'type': 'script',
                        'pre_filter': 'cat -e'}}])

            r.run()

            lines = [line.strip() for line in to_lines(r.cat_output())]
            self.assertEqual(sorted(lines), [b'x$', b'y$', b'z$'])
Example #24
    def test_mapper_pre_filter(self):
        data = b'x\ny\nz\n'
        job = MRFilterJob(['--mapper-filter', 'cat -e', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(r._get_steps(), [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'cat -e'
                }
            }])

            r.run()

            lines = [line.strip() for line in to_lines(r.cat_output())]
            self.assertEqual(sorted(lines), [b'x$', b'y$', b'z$'])
Example #25
    def test_spark_mrjob(self):
        text = b'one fish\ntwo fish\nred fish\nblue fish\n'

        job = MRSparkWordcount(['-r', 'inline'])
        job.sandbox(stdin=BytesIO(text))

        counts = {}

        with job.make_runner() as runner:
            runner.run()

            for line in to_lines(runner.cat_output()):
                k, v = safeeval(line)
                counts[k] = v

        self.assertEqual(counts, dict(
            blue=1, fish=4, one=1, red=1, two=1))
Example #26
    def test_count_words(self):
        job = MRSparkWordcount([])
        job.sandbox(
            stdin=BytesIO(b'Mary had a little lamb\nlittle lamb\nlittle lamb'))

        with job.make_runner() as runner:
            runner.run()

            output = sorted(
                safeeval(line) for line in to_lines(runner.cat_output()))

            self.assertEqual(output, [
                ('a', 1),
                ('had', 1),
                ('lamb', 3),
                ('little', 3),
                ('mary', 1),
            ])
Example #27
    def test_does_not_override_hadoop_input_format(self):
        input1_path = self.makefile('input1', b'potato')
        input2_path = self.makefile('input2', b'potato tomato')

        manifest_path = self.makefile('manifest')
        with open(manifest_path, 'w') as manifest:
            manifest.write('%s\n%s\n' % (input1_path, input2_path))

        job = MRNickNackWithHadoopInputFormat(['-r', 'spark', manifest_path])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

            output_counts = dict(line.strip().split(b'\t')
                                 for line in to_lines(runner.cat_output()))

        self.assertEqual(output_counts, {b'"tomato"': b'1', b'"potato"': b'2'})
Example #28
    def gz_test(self, dir_path_name):
        contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n',
                       b'qux\n', b'foo\n']
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content),
                         all_contents_sorted)
Example #29
    def gz_test(self, dir_path_name):
        contents_gz = [
            b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n', b'foo\n'
        ]
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content), all_contents_sorted)
Example #30
    def test_copy_files_with_rename_to_local_wd_mirror(self):
        # see test_upload_files_with_rename() in test_local for comparison

        fish_path = self.makefile('fish', b'salmon')
        fowl_path = self.makefile('fowl', b'goose')

        # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
        # doesn't have a working directory
        job = MRSparkOSWalk(['-r', 'spark',
                             '--spark-master', _LOCAL_CLUSTER_MASTER,
                             '--file', fish_path + '#ghoti',
                             '--file', fowl_path])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror
            wd_mirror = runner._wd_mirror()
            self.assertIsNotNone(wd_mirror)
            self.assertFalse(is_uri(wd_mirror))

            self.assertTrue(exists(wd_mirror))
            # only files which needed to be renamed should be in wd_mirror
            self.assertTrue(exists(join(wd_mirror, 'ghoti')))
            self.assertFalse(exists(join(wd_mirror, 'fish')))
            self.assertFalse(exists(join(wd_mirror, 'fowl')))

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
        self.assertEqual(file_sizes['ghoti'], 6)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', file_sizes)
Example #31
    def test_copy_files_with_rename_to_local_wd_mirror(self):
        # see test_upload_files_with_rename() in test_local for comparison

        fish_path = self.makefile('fish', b'salmon')
        fowl_path = self.makefile('fowl', b'goose')

        # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
        # doesn't have a working directory
        job = MRSparkOSWalk([
            '-r', 'spark', '--spark-master', _LOCAL_CLUSTER_MASTER, '--files',
            '%s#ghoti,%s' % (fish_path, fowl_path)
        ])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror
            wd_mirror = runner._wd_mirror()
            self.assertIsNotNone(wd_mirror)
            self.assertFalse(is_uri(wd_mirror))

            self.assertTrue(exists(wd_mirror))
            # only files which needed to be renamed should be in wd_mirror
            self.assertTrue(exists(join(wd_mirror, 'ghoti')))
            self.assertFalse(exists(join(wd_mirror, 'fish')))
            self.assertFalse(exists(join(wd_mirror, 'fowl')))

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
        self.assertEqual(file_sizes['ghoti'], 6)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', file_sizes)
Example #32
    def test_hadoop_output_format(self):
        input_bytes = b'ee eye ee eye oh'

        job = MRNickNack(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(input_bytes))

        with job.make_runner() as runner:
            runner.run()

            # nicknack.MultipleValueOutputFormat should put output in subdirs
            self.assertTrue(
                runner.fs.exists(runner.fs.join(runner.get_output_dir(), 'e')))

            self.assertTrue(
                runner.fs.exists(runner.fs.join(runner.get_output_dir(), 'o')))

            # check for expected output
            self.assertEqual(sorted(to_lines(runner.cat_output())),
                             [b'"ee"\t2\n', b'"eye"\t2\n', b'"oh"\t1\n'])
Example #34
    def test_typical_output(self):
        # actual output
        self.makefile(os.path.join(self.output_dir, 'part-00000'), b'line0\n')
        self.makefile(os.path.join(self.output_dir, 'part-00001'), b'line1\n')

        # hidden .crc file
        self.makefile(os.path.join(self.output_dir, '.crc.part-00000'),
                      b'42\n')

        # hidden _SUCCESS file (ignore)
        self.makefile(os.path.join(self.output_dir, '_SUCCESS'),
                      b'such a relief!\n')

        # hidden _logs dir
        self.makefile(os.path.join(self.output_dir, '_logs', 'log.xml'),
                      b'pretty much the usual\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line0\n', b'line1\n'])
Example #35
File: job.py Project: Yelp/mrjob
    def _read_input(self):
        """Read from stdin, or one more files, or directories.
        Yield one line at time.

        - Resolve globs (``foo_*.gz``).
        - Decompress ``.gz`` and ``.bz2`` files.
        - If path is ``-``, read from STDIN.
        - Recursively read all files in a directory
        """
        paths = self.options.args or ['-']

        for path in paths:
            if path == '-':
                for line in self.stdin:
                    yield line
            else:
                with open(path, 'rb') as f:
                    for line in to_lines(decompress(f, path)):
                        yield line
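The decompress()/to_lines() pairing above is what makes a .gz input read like plain lines. A standalone sketch; the temp path is illustrative, and decompress() is assumed importable from mrjob.cat as in recent mrjob releases:

import gzip

from mrjob.cat import decompress
from mrjob.util import to_lines

# write a small gzipped input file
with gzip.open('/tmp/demo.gz', 'wb') as f:
    f.write(b'x\ny\nz\n')

# decompress() picks a codec from the filename suffix; to_lines()
# regroups the decompressed chunks into newline-terminated lines
with open('/tmp/demo.gz', 'rb') as f:
    print(list(to_lines(decompress(f, '/tmp/demo.gz'))))
    # expected: [b'x\n', b'y\n', b'z\n']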
Example #36
    def test_hadoop_output_format(self):
        input_bytes = b'ee eye ee eye oh'

        job = MRNickNack(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(input_bytes))

        with job.make_runner() as runner:
            runner.run()

            # nicknack.MultipleValueOutputFormat should put output in subdirs
            self.assertTrue(runner.fs.exists(
                runner.fs.join(runner.get_output_dir(), 'e')))

            self.assertTrue(runner.fs.exists(
                runner.fs.join(runner.get_output_dir(), 'o')))

            # check for expected output
            self.assertEqual(
                sorted(to_lines(runner.cat_output())),
                [b'"ee"\t2\n', b'"eye"\t2\n', b'"oh"\t1\n'])
Example #37
    def test_cat_reducer(self):
        data = b'x\ny\nz\n'
        job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(
                r._get_steps(),
                [{
                    'type': 'streaming',
                    'mapper': {
                        'type': 'script',
                    },
                    'reducer': {
                        'type': 'command',
                        'command': 'cat -e'}}])

            r.run()

            lines = list(to_lines(r.cat_output()))
            self.assertEqual(sorted(lines), [b'x$\n', b'y$\n', b'z$\n'])
Example #38
    def test_pre_filter_failure(self):
        # regression test for #1524

        data = b'x\ny\nz\n'
        # grep will return exit code 1 because there are no matches
        job = MRFilterJob(['--mapper-filter', 'grep w', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(r._get_steps(), [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep w'
                }
            }])

            r.run()

            lines = [line.strip() for line in to_lines(r.cat_output())]
            self.assertEqual(sorted(lines), [])
Example #39
    def test_pre_filter_failure(self):
        # regression test for #1524

        data = b'x\ny\nz\n'
        # grep will return exit code 1 because there are no matches
        job = MRFilterJob(['--mapper-filter', 'grep w', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(
                r._get_steps(),
                [{
                    'type': 'streaming',
                    'mapper': {
                        'type': 'script',
                        'pre_filter': 'grep w'}}])

            r.run()

            lines = [line.strip() for line in to_lines(r.cat_output())]
            self.assertEqual(sorted(lines), [])
Example #40
    def test_cat_reducer(self):
        data = b'x\ny\nz\n'
        job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(r._get_steps(), [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
                'reducer': {
                    'type': 'command',
                    'command': 'cat -e'
                }
            }])

            r.run()

            lines = list(to_lines(r.cat_output()))
            self.assertEqual(sorted(lines), [b'x$\n', b'y$\n', b'z$\n'])
Example #41
    def test_mixed_job(self):
        # test a combination of streaming and spark steps
        job = MRStreamingAndSpark(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

        with job.make_runner() as runner:
            runner.run()

            # converts to 'null\t"foo"', 'null\t"bar"' and then counts chars
            self.assertEqual(sorted(to_lines(runner.cat_output())), [
                b'\t 2\n',
                b'" 4\n',
                b'a 1\n',
                b'b 1\n',
                b'f 1\n',
                b'l 4\n',
                b'n 2\n',
                b'o 2\n',
                b'r 1\n',
                b'u 2\n',
            ])
Example #42
    def test_typical_output(self):
        # actual output
        self.makefile(os.path.join(self.output_dir, 'part-00000'),
                      b'line0\n')
        self.makefile(os.path.join(self.output_dir, 'part-00001'),
                      b'line1\n')

        # hidden .crc file
        self.makefile(os.path.join(self.output_dir, '.crc.part-00000'),
                      b'42\n')

        # hidden _SUCCESS file (ignore)
        self.makefile(os.path.join(self.output_dir, '_SUCCESS'),
                      b'such a relief!\n')

        # hidden _logs dir
        self.makefile(os.path.join(self.output_dir, '_logs', 'log.xml'),
                      b'pretty much the usual\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line0\n', b'line1\n'])
Example #43
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with mr_job.make_runner() as runner:
            runner.run()

            # expect python -v crud in stderr

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(
                    any('import mrjob' in line or  # Python 2
                        "import 'mrjob'" in line for line in lines))

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any('#' in line for line in lines))

            # should still get expected results
            self.assertEqual(sorted(to_lines(runner.cat_output())),
                             sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #44
    def test_pre_filter_on_compressed_data(self):
        # regression test for #1061
        input_gz_path = self.makefile('data.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'x\ny\nz\n')
        input_gz.close()

        job = MRFilterJob(
            ['--mapper-filter', 'cat -e', '--runner=local', input_gz_path])
        with job.make_runner() as r:
            self.assertEqual(r._get_steps(), [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'cat -e'
                }
            }])

            r.run()

            lines = [line.strip() for line in to_lines(r.cat_output())]
            self.assertEqual(sorted(lines), [b'x$', b'y$', b'z$'])
Example #45
    def test_upload_files_with_rename(self):
        fish_path = self.makefile('fish', b'salmon')
        fowl_path = self.makefile('fowl', b'goose')

        job = MRSparkOSWalk(['-r', 'local',
                             '--file', fish_path + '#ghoti',
                             '--file', fowl_path])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror
            wd_mirror = runner._wd_mirror()
            self.assertIsNotNone(wd_mirror)
            self.assertFalse(is_uri(wd_mirror))

            self.assertTrue(os.path.exists(wd_mirror))
            # only files which needed to be renamed should be in wd_mirror
            self.assertTrue(os.path.exists(os.path.join(wd_mirror, 'ghoti')))
            self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fish')))
            self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fowl')))

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
        self.assertEqual(file_sizes['ghoti'], 6)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', file_sizes)
Example #46
    def test_upload_files_with_rename(self):
        fish_path = self.makefile('fish', b'salmon')
        fowl_path = self.makefile('fowl', b'goose')

        job = MRSparkOSWalk(
            ['-r', 'local', '--files',
             '%s#ghoti,%s' % (fish_path, fowl_path)])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror
            wd_mirror = runner._wd_mirror()
            self.assertIsNotNone(wd_mirror)
            self.assertFalse(is_uri(wd_mirror))

            self.assertTrue(os.path.exists(wd_mirror))
            # only files which needed to be renamed should be in wd_mirror
            self.assertTrue(os.path.exists(os.path.join(wd_mirror, 'ghoti')))
            self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fish')))
            self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fowl')))

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
        self.assertEqual(file_sizes['ghoti'], 6)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', file_sizes)
Example #47
def run_runner(mr_job, i, folder):
    """Run the job once, parse per-key (assignment, prototype) pairs from
    its output, write the assignments to assignments{i+1}.txt, and return
    (i, new_prototype) if the assignments have converged; otherwise record
    the new assignments and write prototypes{i+1}.txt.
    """
    with mr_job.make_runner() as runner:
        runner.run()
        new_assignation = {}
        new_prototype = {}
        for line in to_lines(runner.cat_output()):
            key, value = mr_job.parse_output_line(line)

            new_assignation[key] = value[0]
            new_prototype[key] = value[1]

        cwd = os.getcwd()
        with open(f'{cwd}/{folder}/assignments{i + 1}.txt', mode='w') as new_assignation_file:
            for key, values in new_assignation.items():
                aux = reduce(lambda acc, x: f'{acc} {x}', values, f'{key}:')
                new_assignation_file.write(f'{aux}\n')

        global assignation
        if assignation == new_assignation:
            return i, new_prototype
        else:
            assignation = new_assignation
            write_prototype(new_prototype, f'{cwd}/{folder}/prototypes{i + 1}.txt')
Example #48
    def test_hadoop_output_format(self):
        input_bytes = b'ee eye ee eye oh'

        job = self._harness_job(MRNickNack,
                                input_bytes=input_bytes,
                                runner_alias='local',
                                spark_conf=self.SPARK_CONF)

        with job.make_runner() as runner:
            runner.run()

            self.assertTrue(
                runner.fs.exists(join(runner.get_output_dir(), 'e')))

            self.assertTrue(
                runner.fs.exists(join(runner.get_output_dir(), 'o')))

            output_counts = dict((line.strip().split(b'\t')
                                  for line in to_lines(runner.cat_output())))

        expected_output_counts = {b'"ee"': b'2', b'"eye"': b'2', b'"oh"': b'1'}

        self.assertEqual(expected_output_counts, output_counts)
Example #49
    def test_upload_files_with_rename(self):
        # see test_upload_files_with_rename() in test_local for comparison

        fish_path = self.makefile('fish', b'salmon')
        fowl_path = self.makefile('fowl', b'goose')

        # --use-driver-cwd gets around issues with the shared JVM not changing
        # executors' working directory to match the driver on local master
        job = MRSparkOSWalk([
            '-r', 'inline', '--use-driver-cwd', '--file', fish_path + '#ghoti',
            '--file', fowl_path
        ])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # there is no working dir mirror in inline mode; inline
            # mode simulates the working dir itself
            wd_mirror = runner._wd_mirror()
            self.assertIsNone(wd_mirror)

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
        self.assertEqual(file_sizes['ghoti'], 6)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', file_sizes)
Example #50
    def test_upload_files_with_rename(self):
        # see test_upload_files_with_rename() in test_local for comparison

        fish_path = self.makefile('fish', b'salmon')
        fowl_path = self.makefile('fowl', b'goose')

        # --use-driver-cwd gets around issues with the shared JVM not changing
        # executors' working directory to match the driver on local master
        job = MRSparkOSWalk(['-r', 'inline',
                             '--use-driver-cwd',
                             '--file', fish_path + '#ghoti',
                             '--file', fowl_path])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # there is no working dir mirror in inline mode; inline
            # mode simulates the working dir itself
            wd_mirror = runner._wd_mirror()
            self.assertIsNone(wd_mirror)

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
        self.assertEqual(file_sizes['ghoti'], 6)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', file_sizes)
Example #51
    def test_hadoop_output_format_with_compression(self):
        input_bytes = b"one two one two"
        compression_codec = 'org.apache.hadoop.io.compress.GzipCodec'

        job = self._harness_job(
            MRNickNack, input_bytes=input_bytes, runner_alias='local',
            spark_conf=self.SPARK_CONF, compression_codec=compression_codec)

        with job.make_runner() as runner:
            runner.run()

            self.assertTrue(runner.fs.exists(
                join(runner.get_output_dir(), 'o', 'part*.gz')))

            self.assertTrue(runner.fs.exists(
                join(runner.get_output_dir(), 't', 'part*.gz')))

            output_counts = dict(
                line.strip().split(b'\t')
                for line in to_lines(runner.cat_output()))

        expected_output_counts = {b'"one"': b'2', b'"two"': b'2'}

        self.assertEqual(expected_output_counts, output_counts)
Example #52
 def test_empty(self):
     self.assertEqual(list(to_lines(_ for _ in ())), [])
Example #53
 def test_no_trailing_newline(self):
     self.assertEqual(
         list(to_lines(chunk for chunk in
                       [b'Alouette,\ngentille',
                        b' Alouette.'])),
         [b'Alouette,\n', b'gentille Alouette.'])
Example #54
 def test_no_trailing_newline(self):
     self.assertEqual(
         list(to_lines(iter([
             b'Alouette,\ngentille',
             b' Alouette.',
         ]))), [b'Alouette,\n', b'gentille Alouette.'])
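Taken together, these tests pin down to_lines()'s contract: buffer partial lines across chunks, treat an empty chunk as end-of-file, and emit any trailing partial line. A minimal sketch of that contract (not mrjob's actual implementation):

def to_lines_sketch(chunks):
    """Regroup an iterable of byte chunks into newline-terminated lines."""
    buf = b''
    for chunk in chunks:
        if not chunk:
            # an empty chunk acts as EOF (see Example #14): flush any
            # buffered partial line
            if buf:
                yield buf
                buf = b''
            continue
        buf += chunk
        while b'\n' in buf:
            line, buf = buf.split(b'\n', 1)
            yield line + b'\n'
    if buf:
        # input ended without a trailing newline (Examples #2, #53, #54)
        yield buf

# e.g. list(to_lines_sketch(iter([b'a\nb', b'c']))) -> [b'a\n', b'bc']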