Ejemplo n.º 1
0
    def test_range_type(self):
        # the lazy range type is spelled xrange on Python 2 and range on
        # Python 3; the two have different reprs
        range_type = xrange if PY2 else range

        sample = range_type(3)
        sample_repr = repr(sample)

        # comparing reprs works on every Python version
        self.assertEqual(repr(safeeval(sample_repr)), sample_repr)

        # ranges can't be checked for equality until Python 3.3+
        if sys.version_info >= (3, 3):
            self.assertEqual(safeeval(sample_repr), sample)
Ejemplo n.º 2
0
    def test_range_type(self):
        # The repr of a range differs between Python 2 and Python 3, and
        # ranges can't be compared for equality before Python 3.3, so
        # always compare reprs and only compare values on 3.3+.
        make_range = xrange if PY2 else range

        round_tripped = safeeval(repr(make_range(3)))
        self.assertEqual(repr(round_tripped), repr(make_range(3)))

        if sys.version_info >= (3, 3):
            self.assertEqual(safeeval(repr(make_range(3))), make_range(3))
Ejemplo n.º 3
0
    def test_end_to_end(self):
        script = spark_wordcount_script.__file__
        if script.endswith('.pyc'):
            # drop the trailing "c" so we pass the .py path instead
            script = script[:-1]

        text_path = self.makefile(
            'input', b'one fish\ntwo fish\nred fish\nblue fish\n')

        # don't create the output path ourselves; let Spark do it
        out_dir = join(self.tmp_dir, 'output')
        self.assertFalse(exists(out_dir))

        argv = ['-r', 'local', script, text_path, out_dir]
        spark_submit_main(argv)

        self.assertTrue(exists(out_dir))

        # collect (word, count) pairs from every part file
        tallies = {}
        for part_path in glob(join(out_dir, 'part-*')):
            with open(part_path) as part_file:
                for raw in part_file:
                    token, total = safeeval(raw)
                    tallies[token] = total

        expected = {'blue': 1, 'fish': 4, 'one': 1, 'red': 1, 'two': 1}
        self.assertEqual(tallies, expected)
Ejemplo n.º 4
0
    def test_end_to_end(self):
        # use the script's source path, not its compiled .pyc
        source_path = spark_wordcount_script.__file__
        if source_path.endswith('.pyc'):
            source_path = source_path[:-1]

        words_path = self.makefile(
            'input', b'one fish\ntwo fish\nred fish\nblue fish\n')

        # Spark should create this directory, so it must not exist yet
        results_dir = join(self.tmp_dir, 'output')
        self.assertFalse(exists(results_dir))

        spark_submit_main(
            ['-r', 'local', source_path, words_path, results_dir])

        self.assertTrue(exists(results_dir))

        seen = {}
        part_paths = glob(join(results_dir, 'part-*'))
        for part_path in part_paths:
            with open(part_path) as part:
                for row in part:
                    key, n = safeeval(row)
                    seen[key] = n

        self.assertEqual(
            seen, {'blue': 1, 'fish': 4, 'one': 1, 'red': 1, 'two': 1})
Ejemplo n.º 5
0
    def test_archive_emulation(self):
        # build a directory with two small files, then archive it
        src_dir = self.makedirs('f')
        self.makefile(join(src_dir, 'fish'), b'salmon')
        self.makefile(join(src_dir, 'fowl'), b'goose')

        archive_path = make_archive(join(self.tmp_dir, 'f'), 'gztar', src_dir)

        # pass the same directory both as an archive and via --dirs
        argv = [
            '-r', 'local',
            '--archives', '%s#f-unpacked' % archive_path,
            '--dirs', src_dir,
        ]
        job = MRSparkOSWalk(argv)
        job.sandbox()

        sizes = {}

        with job.make_runner() as runner:
            runner.run()

            for raw in to_lines(runner.cat_output()):
                rel_path, num_bytes = safeeval(raw)
                sizes[rel_path] = num_bytes

        # both copies of the directory should contain both files
        expected_sizes = (
            ('f/fish', 6),
            ('f/fowl', 5),
            ('f-unpacked/fish', 6),
            ('f-unpacked/fowl', 5),
        )
        for rel_path, num_bytes in expected_sizes:
            self.assertIn(rel_path, sizes)
            self.assertEqual(sizes[rel_path], num_bytes)

        # archives should have been uploaded as files
        for uploaded in ('f.tar.gz.file', 'f-1.tar.gz.file'):
            self.assertIn(uploaded, sizes)
Ejemplo n.º 6
0
    def test_spark_mrjob(self):
        stdin_bytes = b'one fish\ntwo fish\nred fish\nblue fish\n'

        job = MRSparkWordcount(['-r', 'inline'])
        job.sandbox(stdin=BytesIO(stdin_bytes))

        # gather the (word, count) pairs the job emits
        tallies = {}

        with job.make_runner() as runner:
            runner.run()

            for raw in to_lines(runner.cat_output()):
                token, total = safeeval(raw)
                tallies[token] = total

        expected = {'blue': 1, 'fish': 4, 'one': 1, 'red': 1, 'two': 1}
        self.assertEqual(tallies, expected)
Ejemplo n.º 7
0
    def test_spark_mrjob(self):
        payload = b'one fish\ntwo fish\nred fish\nblue fish\n'

        # run the wordcount job on the inline runner, reading from stdin
        job = MRSparkWordcount(['-r', 'inline'])
        job.sandbox(stdin=BytesIO(payload))

        seen = {}

        with job.make_runner() as runner:
            runner.run()

            output_lines = to_lines(runner.cat_output())
            for row in output_lines:
                key, n = safeeval(row)
                seen[key] = n

        self.assertEqual(
            seen, {'blue': 1, 'fish': 4, 'one': 1, 'red': 1, 'two': 1})
Ejemplo n.º 8
0
    def test_count_words(self):
        # feed a few lines of text to the job through stdin
        lyrics = b'Mary had a little lamb\nlittle lamb\nlittle lamb'

        job = MRSparkWordcount([])
        job.sandbox(stdin=BytesIO(lyrics))

        with job.make_runner() as runner:
            runner.run()

            # sort the pairs so the comparison doesn't depend on output order
            pairs = [safeeval(raw) for raw in to_lines(runner.cat_output())]
            pairs.sort()

            expected = [
                ('a', 1),
                ('had', 1),
                ('lamb', 3),
                ('little', 3),
                ('mary', 1),
            ]
            self.assertEqual(pairs, expected)
Ejemplo n.º 9
0
    def test_copy_files_with_rename_to_local_wd_mirror(self):
        # see test_upload_files_with_rename() in test_local for comparison

        salmon_file = self.makefile('fish', b'salmon')
        goose_file = self.makefile('fowl', b'goose')

        # the default master (local[*]) doesn't have a working directory,
        # so run against _LOCAL_CLUSTER_MASTER instead
        argv = [
            '-r', 'spark',
            '--spark-master', _LOCAL_CLUSTER_MASTER,
            '--file', salmon_file + '#ghoti',
            '--file', goose_file,
        ]
        job = MRSparkOSWalk(argv)
        job.sandbox()

        sizes_by_name = {}

        with job.make_runner() as runner:
            runner.run()

            # the working dir mirror should exist and not be a URI
            mirror_dir = runner._wd_mirror()
            self.assertIsNotNone(mirror_dir)
            self.assertFalse(is_uri(mirror_dir))
            self.assertTrue(exists(mirror_dir))

            # only files which needed to be renamed should be in the mirror
            self.assertTrue(exists(join(mirror_dir, 'ghoti')))
            for unrenamed in ('fish', 'fowl'):
                self.assertFalse(exists(join(mirror_dir, unrenamed)))

            for raw in to_lines(runner.cat_output()):
                name, num_bytes = safeeval(raw)
                sizes_by_name[name] = num_bytes

        # check that files were uploaded to working dir
        for name, num_bytes in (('fowl', 5), ('ghoti', 6)):
            self.assertIn(name, sizes_by_name)
            self.assertEqual(sizes_by_name[name], num_bytes)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', sizes_by_name)
Ejemplo n.º 10
0
    def test_copy_files_with_rename_to_local_wd_mirror(self):
        # see test_upload_files_with_rename() in test_local for comparison

        src_fish = self.makefile('fish', b'salmon')
        src_fowl = self.makefile('fowl', b'goose')

        # one comma-separated --files value; only fish gets renamed
        files_arg = '%s#ghoti,%s' % (src_fish, src_fowl)

        # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
        # doesn't have a working directory
        job = MRSparkOSWalk(
            ['-r', 'spark', '--spark-master', _LOCAL_CLUSTER_MASTER,
             '--files', files_arg])
        job.sandbox()

        observed = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror: it exists and is not a URI
            mirror = runner._wd_mirror()
            self.assertIsNotNone(mirror)
            self.assertFalse(is_uri(mirror))
            self.assertTrue(exists(mirror))

            # only files which needed to be renamed should be in the mirror
            renamed_copy = join(mirror, 'ghoti')
            self.assertTrue(exists(renamed_copy))
            self.assertFalse(exists(join(mirror, 'fish')))
            self.assertFalse(exists(join(mirror, 'fowl')))

            for row in to_lines(runner.cat_output()):
                key, n = safeeval(row)
                observed[key] = n

        # check that files were uploaded to working dir
        for key, n in (('fowl', 5), ('ghoti', 6)):
            self.assertIn(key, observed)
            self.assertEqual(observed[key], n)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', observed)
Ejemplo n.º 11
0
    def test_upload_files_with_rename(self):
        src_fish = self.makefile('fish', b'salmon')
        src_fowl = self.makefile('fowl', b'goose')

        # one comma-separated --files value; only fish gets renamed
        files_arg = '%s#ghoti,%s' % (src_fish, src_fowl)
        job = MRSparkOSWalk(['-r', 'local', '--files', files_arg])
        job.sandbox()

        observed = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror: it exists and is not a URI
            mirror = runner._wd_mirror()
            self.assertIsNotNone(mirror)
            self.assertFalse(is_uri(mirror))
            self.assertTrue(os.path.exists(mirror))

            # only files which needed to be renamed should be in the mirror
            self.assertTrue(os.path.exists(os.path.join(mirror, 'ghoti')))
            for unrenamed in ('fish', 'fowl'):
                self.assertFalse(
                    os.path.exists(os.path.join(mirror, unrenamed)))

            for row in to_lines(runner.cat_output()):
                key, n = safeeval(row)
                observed[key] = n

        # check that files were uploaded to working dir
        for key, n in (('fowl', 5), ('ghoti', 6)):
            self.assertIn(key, observed)
            self.assertEqual(observed[key], n)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', observed)
Ejemplo n.º 12
0
    def test_upload_files_with_rename(self):
        salmon_file = self.makefile('fish', b'salmon')
        goose_file = self.makefile('fowl', b'goose')

        # fish is passed with a "#ghoti" suffix; fowl keeps its own name
        argv = [
            '-r', 'local',
            '--file', salmon_file + '#ghoti',
            '--file', goose_file,
        ]
        job = MRSparkOSWalk(argv)
        job.sandbox()

        sizes_by_name = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror: it exists and is not a URI
            mirror_dir = runner._wd_mirror()
            self.assertIsNotNone(mirror_dir)
            self.assertFalse(is_uri(mirror_dir))
            self.assertTrue(os.path.exists(mirror_dir))

            # only files which needed to be renamed should be in the mirror
            ghoti_copy = os.path.join(mirror_dir, 'ghoti')
            fish_copy = os.path.join(mirror_dir, 'fish')
            fowl_copy = os.path.join(mirror_dir, 'fowl')
            self.assertTrue(os.path.exists(ghoti_copy))
            self.assertFalse(os.path.exists(fish_copy))
            self.assertFalse(os.path.exists(fowl_copy))

            for raw in to_lines(runner.cat_output()):
                name, num_bytes = safeeval(raw)
                sizes_by_name[name] = num_bytes

        # check that files were uploaded to working dir
        for name, num_bytes in (('fowl', 5), ('ghoti', 6)):
            self.assertIn(name, sizes_by_name)
            self.assertEqual(sizes_by_name[name], num_bytes)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', sizes_by_name)
Ejemplo n.º 13
0
    def test_upload_files_with_rename(self):
        # see test_upload_files_with_rename() in test_local for comparison

        salmon_file = self.makefile('fish', b'salmon')
        goose_file = self.makefile('fowl', b'goose')

        # --use-driver-cwd gets around issues with the shared JVM not changing
        # executors' working directory to match the driver on local master
        argv = [
            '-r', 'inline',
            '--use-driver-cwd',
            '--file', salmon_file + '#ghoti',
            '--file', goose_file,
        ]
        job = MRSparkOSWalk(argv)
        job.sandbox()

        sizes_by_name = {}

        with job.make_runner() as runner:
            runner.run()

            # there is no working dir mirror in inline mode; inline
            # mode simulates the working dir itself
            self.assertIsNone(runner._wd_mirror())

            for raw in to_lines(runner.cat_output()):
                name, num_bytes = safeeval(raw)
                sizes_by_name[name] = num_bytes

        # check that files were uploaded to working dir
        for name, num_bytes in (('fowl', 5), ('ghoti', 6)):
            self.assertIn(name, sizes_by_name)
            self.assertEqual(sizes_by_name[name], num_bytes)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', sizes_by_name)
Ejemplo n.º 14
0
    def test_upload_files_with_rename(self):
        # see test_upload_files_with_rename() in test_local for comparison

        src_fish = self.makefile('fish', b'salmon')
        src_fowl = self.makefile('fowl', b'goose')

        # --use-driver-cwd gets around issues with the shared JVM not changing
        # executors' working directory to match the driver on local master
        renamed_fish = src_fish + '#ghoti'
        job = MRSparkOSWalk(
            ['-r', 'inline', '--use-driver-cwd',
             '--file', renamed_fish,
             '--file', src_fowl])
        job.sandbox()

        observed = {}

        with job.make_runner() as runner:
            runner.run()

            # there is no working dir mirror in inline mode; inline
            # mode simulates the working dir itself
            mirror = runner._wd_mirror()
            self.assertIsNone(mirror)

            output_lines = to_lines(runner.cat_output())
            for row in output_lines:
                key, n = safeeval(row)
                observed[key] = n

        # check that files were uploaded to working dir
        for key, n in (('fowl', 5), ('ghoti', 6)):
            self.assertIn(key, observed)
            self.assertEqual(observed[key], n)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', observed)
Ejemplo n.º 15
0
 def read(cls, line):
     """Split *line* on tab into two fields and return a
     ``(key, value)`` tuple of safeeval() applied to each field.
     """
     raw_key, raw_value = line.split('\t')
     decoded_key = safeeval(raw_key)
     decoded_value = safeeval(raw_value)
     return decoded_key, decoded_value
Ejemplo n.º 16
0
 def test_globals_and_locals(self):
     # safeeval() should see the names passed in as globals and locals
     number = -0.2
     evaluated = safeeval(
         'abs(a)', globals={'abs': abs}, locals={'a': number})
     self.assertEqual(abs(number), evaluated)
Ejemplo n.º 17
0
 def test_globals_and_locals(self):
     # pass abs in through globals and the operand in through locals
     operand = -0.2
     scope_globals = {"abs": abs}
     scope_locals = {"a": operand}
     self.assertEqual(
         abs(operand),
         safeeval("abs(a)", globals=scope_globals, locals=scope_locals))
Ejemplo n.º 18
0
 def read(cls, line):
     """Return a ``(key, value)`` tuple whose key is always None and
     whose value is safeeval() applied to the whole *line*.
     """
     value = safeeval(line)
     return None, value
Ejemplo n.º 19
0
 def test_simple_data_structures(self):
     # each of these should come back equal after repr() then safeeval()
     samples = (True, None, 1, [0, 1, 2, 3, 4], {'foo': False, 'bar': 2})
     for original in samples:
         round_tripped = safeeval(repr(original))
         self.assertEqual(original, round_tripped)
Ejemplo n.º 20
0
 def read(cls, line):
     """Return ``(key, value)`` for a tab-separated *line*, with
     safeeval() applied to the text on each side of the tab.
     """
     key_text, value_text = line.split('\t')
     return (safeeval(key_text), safeeval(value_text))
Ejemplo n.º 21
0
 def load_from_string(cls, value):
     """Return the result of calling safeeval() on *value*."""
     loaded = safeeval(value)
     return loaded
Ejemplo n.º 22
0
 def _loads(self, value):
     """Return the result of calling safeeval() on *value*."""
     decoded = safeeval(value)
     return decoded
Ejemplo n.º 23
0
 def read(self, line):
     """Return ``(None, value)``, where value is safeeval() applied to
     the whole *line*; the key is always None.
     """
     value = safeeval(line)
     return None, value
Ejemplo n.º 24
0
 def test_simple_data_structures(self):
     # round-trip a handful of simple values through repr() and safeeval()
     cases = [True, None, 1, [0, 1, 2, 3, 4], {'foo': False, 'bar': 2}]
     for expected in cases:
         as_text = repr(expected)
         self.assertEqual(expected, safeeval(as_text))
Ejemplo n.º 25
0
 def _loads(self, value):
     """Pass *value* to safeeval() and return whatever it produces."""
     result = safeeval(value)
     return result
Ejemplo n.º 26
0
 def test_globals_and_locals(self):
     # the expression's names come from the globals/locals we pass in
     value = -0.2
     namespaces = dict(globals={'abs': abs}, locals={'a': value})
     outcome = safeeval('abs(a)', **namespaces)
     self.assertEqual(abs(value), outcome)
Ejemplo n.º 27
0
 def read(cls, line):
     """Apply safeeval() to *line* and return it as the value of a
     ``(None, value)`` tuple.
     """
     parsed = safeeval(line)
     return (None, parsed)
Ejemplo n.º 28
0
 def load_from_string(cls, value):
     """Pass *value* to safeeval() and return whatever it produces."""
     result = safeeval(value)
     return result
Ejemplo n.º 29
0
 def test_simple_data_structure(self):
     # each of these should come back equal after repr() then safeeval()
     samples = (True, None, 1, range(5), {"foo": False, "bar": 2})
     for original in samples:
         round_tripped = safeeval(repr(original))
         self.assertEqual(original, round_tripped)
Ejemplo n.º 30
0
 def test_simple_data_structure(self):
     # round-trip a handful of simple values through repr() and safeeval()
     cases = [True, None, 1, range(5), {'foo': False, 'bar': 2}]
     for expected in cases:
         as_text = repr(expected)
         assert_equal(expected, safeeval(as_text))
Ejemplo n.º 31
0
 def read(self, line):
     """Apply safeeval() to *line* and return it as the value of a
     ``(None, value)`` tuple.
     """
     parsed = safeeval(line)
     return (None, parsed)
Ejemplo n.º 32
0
 def test_simple_data_structure(self):
     # every sample should equal the result of safeeval() on its own repr
     samples = (True, None, 1, range(5), {'foo': False, 'bar': 2})
     for sample in samples:
         restored = safeeval(repr(sample))
         assert_equal(sample, restored)