import os
import shutil
import tempfile
import unittest

from pyspark import SparkConf, SparkContext
from pyspark_flame import FlameProfiler


class PysparkFlameTest(unittest.TestCase):
    def setUp(self):
        self.dumpdir = tempfile.mkdtemp()
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[*]',
                               'test',
                               conf=conf,
                               profiler_cls=FlameProfiler,
                               environment={'pyspark_flame.interval': 0.25})

    def tearDown(self):
        self.sc.stop()
        shutil.rmtree(self.dumpdir)

    def test_pyspark_flame(self):
        self.sc.parallelize(range(4)).map(wait_a_bit).sum()
        self.sc.dump_profiles(self.dumpdir)
        dumps = os.listdir(self.dumpdir)
        self.assertEqual(1, len(dumps))
        with open(os.path.join(self.dumpdir, dumps[0])) as dumpfile:
            for line in dumpfile.readlines():
                location, count = line.split(' ')
                if 'pyspark_flame_test.py:wait_a_bit:11' in location:
                    count = int(count)
                    self.assertIn(count, range(70, 90))
                    return
            else:
                self.fail('No wait_a_bit profile line found')

    def test_propagate_exception(self):
        with self.assertRaises(Exception):
            self.sc.parallelize(range(4)).map(crash).sum()
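
# The PysparkFlameTest above also relies on two module-level helpers,
# wait_a_bit and crash, that are defined elsewhere in pyspark_flame's own
# test module (the matched location 'pyspark_flame_test.py:wait_a_bit:11'
# points at the real definition). The versions below are hypothetical
# stand-ins, not the originals, kept only so the snippet is self-contained.
import time


def wait_a_bit(x):
    # Assumption: sleeping ~5s per element gives the 0.25s sampler roughly
    # 80 samples in total, which is what the range(70, 90) assertion expects.
    time.sleep(5)
    return x


def crash(x):
    # Used by test_propagate_exception to check that worker errors propagate.
    raise RuntimeError('intentional failure in a Spark worker')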
Example #2
class ProfilerTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[4]', class_name, conf=conf)

    def test_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        id, profiler, _ = profilers[0]
        stats = profiler.stats()
        self.assertTrue(stats is not None)
        width, stat_list = stats.get_print_list([])
        func_names = [func_name for fname, n, func_name in stat_list]
        self.assertTrue("heavy_foo" in func_names)

        old_stdout = sys.stdout
        sys.stdout = io = StringIO()
        self.sc.show_profiles()
        self.assertTrue("heavy_foo" in io.getvalue())
        sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)
        self.assertTrue("rdd_%d.pstats" % id in os.listdir(d))

    def test_custom_profiler(self):
        class TestCustomProfiler(BasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        def heavy_foo(x):
            for i in range(1 << 18):
                x = 1

        rdd = self.sc.parallelize(range(100))
        rdd.foreach(heavy_foo)
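
# For reference: dump_profiles() above writes one rdd_<id>.pstats file per
# profiled RDD in cProfile/pstats format, so it should be loadable offline
# with the stdlib pstats module. A minimal sketch; the directory is whatever
# was passed to dump_profiles() (tempfile.gettempdir() in the test above).
import glob
import os
import pstats
import tempfile

for path in glob.glob(os.path.join(tempfile.gettempdir(), "rdd_*.pstats")):
    # Print the ten entries with the largest cumulative time for each dump.
    pstats.Stats(path).sort_stats("cumulative").print_stats(10)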
Example #3
    def test_profiler_disabled(self):
        sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
        try:
            self.assertRaisesRegexp(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.show_profiles())
            self.assertRaisesRegexp(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.dump_profiles("/tmp/abc"))
        finally:
            sc.stop()
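
# The RuntimeError checked above is raised whenever show_profiles() or
# dump_profiles() is called while "spark.python.profile" is not enabled.
# A minimal sketch of guarding those calls on the current configuration;
# the printed message is just illustrative.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
try:
    if sc.getConf().get("spark.python.profile", "false") == "true":
        sc.show_profiles()
    else:
        print("Python profiling is disabled; skipping show_profiles()")
finally:
    sc.stop()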
Example #4
class UDFProfilerTests(unittest.TestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext("local[4]", class_name, conf=conf)
        self.spark = SparkSession.builder._sparkContext(self.sc).getOrCreate()

    def tearDown(self):
        self.spark.stop()
        sys.path = self._old_sys_path

    def test_udf_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))

        old_stdout = sys.stdout
        try:
            sys.stdout = io = StringIO()
            self.sc.show_profiles()
        finally:
            sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)

        for i, udf_name in enumerate(["add1", "add2", "add1"]):
            id, profiler, _ = profilers[i]
            with self.subTest(id=id, udf_name=udf_name):
                stats = profiler.stats()
                self.assertTrue(stats is not None)
                width, stat_list = stats.get_print_list([])
                func_names = [func_name for fname, n, func_name in stat_list]
                self.assertTrue(udf_name in func_names)

                self.assertTrue(udf_name in io.getvalue())
                self.assertTrue("udf_%d.pstats" % id in os.listdir(d))

    def test_custom_udf_profiler(self):
        class TestCustomProfiler(UDFBasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.udf_profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        @udf
        def add1(x):
            return x + 1

        @udf
        def add2(x):
            return x + 2

        df = self.spark.range(10)
        df.select(add1("id"), add2("id"), add1("id")).collect()
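
# Roughly the same flow outside the unittest harness; a minimal sketch that
# assumes a PySpark version recent enough to profile Python UDFs (the
# UDFBasicProfiler machinery used above). Names such as "udf-profiler-demo"
# are just placeholders.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = (SparkSession.builder
         .master("local[4]")
         .appName("udf-profiler-demo")
         .config("spark.python.profile", "true")
         .getOrCreate())


@udf("long")
def add1(x):
    return x + 1


spark.range(10).select(add1("id")).collect()
spark.sparkContext.show_profiles()  # one profile section per profiled UDF
spark.stop()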
Example #5
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
# The Spark master must be set under the key 'spark.master'
# (or via conf.setMaster()); a bare 'master' key is ignored.
conf.set('spark.master', 'spark://hadoop-maste:7077')
conf.set('spark.python.profile', 'true')
context = SparkContext(conf=conf)
rdd = context.parallelize(np.arange(10), 3)
print(rdd.collect())
context.show_profiles()  # prints the collected profiles to stdout
context.dump_profiles('/datas/profiles/')
context.stop()
Example #6
import time
import random
from pyspark_flame import FlameProfiler
from pyspark import SparkConf, SparkContext


def multiply_inefficiently(x):
    for i in range(1000):
        time.sleep(0.0001 * random.random())
        time.sleep(0.0001 * random.random())
    return x * 2


# Optionally add .set("spark.python.profile.dump", ".") to dump profiles on exit.
conf = SparkConf().set("spark.python.profile", "true")
sc = SparkContext('local', 'test', conf=conf, profiler_cls=FlameProfiler,
                  environment={'pyspark_flame.interval': 0.25})
sc.parallelize(range(1000)).map(multiply_inefficiently).take(10)
sc.show_profiles()
sc.dump_profiles('.')
sc.stop()
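
# The dumps written by FlameProfiler contain one "<stack> <sample count>"
# line per sampled stack (the format the first example parses), i.e. the
# collapsed-stack input expected by flamegraph.pl from
# https://github.com/brendangregg/FlameGraph. A minimal sketch of rendering
# one dump to an SVG; the dump filename is a placeholder (check what
# dump_profiles('.') actually wrote in your run) and flamegraph.pl is
# assumed to be on PATH.
import subprocess

dump_file = 'rdd_1.flame'  # placeholder filename
with open(dump_file) as stacks, open(dump_file + '.svg', 'w') as out:
    subprocess.run(['flamegraph.pl'], stdin=stacks, stdout=out, check=True)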