import os
import shutil
import tempfile
import unittest

from pyspark import SparkConf, SparkContext
from pyspark_flame import FlameProfiler


class PysparkFlameTest(unittest.TestCase):
    def setUp(self):
        self.dumpdir = tempfile.mkdtemp()
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[*]',
                               'test',
                               conf=conf,
                               profiler_cls=FlameProfiler,
                               environment={'pyspark_flame.interval': 0.25})

    def tearDown(self):
        self.sc.stop()
        shutil.rmtree(self.dumpdir)

    def test_pyspark_flame(self):
        self.sc.parallelize(range(4)).map(wait_a_bit).sum()
        self.sc.dump_profiles(self.dumpdir)
        dumps = os.listdir(self.dumpdir)
        self.assertEqual(1, len(dumps))
        with open(os.path.join(self.dumpdir, dumps[0])) as dumpfile:
            for line in dumpfile.readlines():
                location, count = line.split(' ')
                if 'pyspark_flame_test.py:wait_a_bit:11' in location:
                    count = int(count)
                    self.assertIn(count, range(70, 90))
                    return
            else:
                self.fail('No wait_a_bit profile line found')

    def test_propagate_exception(self):
        with self.assertRaises(Exception):
            self.sc.parallelize(range(4)).map(crash).sum()
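
# The PysparkFlameTest above also relies on two module-level helpers,
# wait_a_bit and crash, that are defined elsewhere in pyspark_flame's own
# test module (the matched location 'pyspark_flame_test.py:wait_a_bit:11'
# points at the real definition). The versions below are hypothetical
# stand-ins, not the originals, kept only so the snippet is self-contained.
import time


def wait_a_bit(x):
    # Assumption: sleeping ~5s per element gives the 0.25s sampler roughly
    # 80 samples in total, which is what the range(70, 90) assertion expects.
    time.sleep(5)
    return x


def crash(x):
    # Used by test_propagate_exception to check that worker errors propagate.
    raise RuntimeError('intentional failure in a Spark worker')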
Example #2
class ProfilerTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[4]', class_name, conf=conf)

    def test_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        id, profiler, _ = profilers[0]
        stats = profiler.stats()
        self.assertTrue(stats is not None)
        width, stat_list = stats.get_print_list([])
        func_names = [func_name for fname, n, func_name in stat_list]
        self.assertTrue("heavy_foo" in func_names)

        old_stdout = sys.stdout
        sys.stdout = io = StringIO()
        self.sc.show_profiles()
        self.assertTrue("heavy_foo" in io.getvalue())
        sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)
        self.assertTrue("rdd_%d.pstats" % id in os.listdir(d))

    def test_custom_profiler(self):
        class TestCustomProfiler(BasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        def heavy_foo(x):
            for i in range(1 << 18):
                x = 1

        rdd = self.sc.parallelize(range(100))
        rdd.foreach(heavy_foo)
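
# For reference: dump_profiles() above writes one rdd_<id>.pstats file per
# profiled RDD in cProfile/pstats format, so it should be loadable offline
# with the stdlib pstats module. A minimal sketch; the directory is whatever
# was passed to dump_profiles() (tempfile.gettempdir() in the test above).
import glob
import os
import pstats
import tempfile

for path in glob.glob(os.path.join(tempfile.gettempdir(), "rdd_*.pstats")):
    # Print the ten entries with the largest cumulative time for each dump.
    pstats.Stats(path).sort_stats("cumulative").print_stats(10)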
Example #3
    def test_profiler_disabled(self):
        sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
        try:
            self.assertRaisesRegexp(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.show_profiles())
            self.assertRaisesRegexp(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.dump_profiles("/tmp/abc"))
        finally:
            sc.stop()
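
# The RuntimeError checked above is raised whenever show_profiles() or
# dump_profiles() is called while "spark.python.profile" is not enabled.
# A minimal sketch of guarding those calls on the current configuration;
# the printed message is just illustrative.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
try:
    if sc.getConf().get("spark.python.profile", "false") == "true":
        sc.show_profiles()
    else:
        print("Python profiling is disabled; skipping show_profiles()")
finally:
    sc.stop()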
Example #4
class UDFProfilerTests(unittest.TestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext("local[4]", class_name, conf=conf)
        self.spark = SparkSession.builder._sparkContext(self.sc).getOrCreate()

    def tearDown(self):
        self.spark.stop()
        sys.path = self._old_sys_path

    def test_udf_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))

        old_stdout = sys.stdout
        try:
            sys.stdout = io = StringIO()
            self.sc.show_profiles()
        finally:
            sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)

        for i, udf_name in enumerate(["add1", "add2", "add1"]):
            id, profiler, _ = profilers[i]
            with self.subTest(id=id, udf_name=udf_name):
                stats = profiler.stats()
                self.assertTrue(stats is not None)
                width, stat_list = stats.get_print_list([])
                func_names = [func_name for fname, n, func_name in stat_list]
                self.assertTrue(udf_name in func_names)

                self.assertTrue(udf_name in io.getvalue())
                self.assertTrue("udf_%d.pstats" % id in os.listdir(d))

    def test_custom_udf_profiler(self):
        class TestCustomProfiler(UDFBasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.udf_profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        @udf
        def add1(x):
            return x + 1

        @udf
        def add2(x):
            return x + 2

        df = self.spark.range(10)
        df.select(add1("id"), add2("id"), add1("id")).collect()
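
# Roughly the same flow outside the unittest harness; a minimal sketch that
# assumes a PySpark version recent enough to profile Python UDFs (the
# UDFBasicProfiler machinery used above). Names such as "udf-profiler-demo"
# are just placeholders.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = (SparkSession.builder
         .master("local[4]")
         .appName("udf-profiler-demo")
         .config("spark.python.profile", "true")
         .getOrCreate())


@udf("long")
def add1(x):
    return x + 1


spark.range(10).select(add1("id")).collect()
spark.sparkContext.show_profiles()  # one profile section per profiled UDF
spark.stop()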
Example #5
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
# The Spark master must be set under the key 'spark.master'
# (or via conf.setMaster()); a bare 'master' key is ignored.
conf.set('spark.master', 'spark://hadoop-maste:7077')
conf.set('spark.python.profile', 'true')
context = SparkContext(conf=conf)
rdd = context.parallelize(np.arange(10), 3)
print(rdd.collect())
context.show_profiles()  # prints the collected profiles to stdout
context.dump_profiles('/datas/profiles/')
context.stop()
Example #6
import time
import random
from pyspark_flame import FlameProfiler
from pyspark import SparkConf, SparkContext


def multiply_inefficiently(x):
    for i in range(1000):
        time.sleep(0.0001 * random.random())
        time.sleep(0.0001 * random.random())
    return x * 2


# Optionally add .set("spark.python.profile.dump", ".") to dump profiles on exit.
conf = SparkConf().set("spark.python.profile", "true")
sc = SparkContext('local', 'test', conf=conf, profiler_cls=FlameProfiler,
                  environment={'pyspark_flame.interval': 0.25})
sc.parallelize(range(1000)).map(multiply_inefficiently).take(10)
sc.show_profiles()
sc.dump_profiles('.')
sc.stop()
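
# The dumps written by FlameProfiler contain one "<stack> <sample count>"
# line per sampled stack (the format the first example parses), i.e. the
# collapsed-stack input expected by flamegraph.pl from
# https://github.com/brendangregg/FlameGraph. A minimal sketch of rendering
# one dump to an SVG; the dump filename is a placeholder (check what
# dump_profiles('.') actually wrote in your run) and flamegraph.pl is
# assumed to be on PATH.
import subprocess

dump_file = 'rdd_1.flame'  # placeholder filename
with open(dump_file) as stacks, open(dump_file + '.svg', 'w') as out:
    subprocess.run(['flamegraph.pl'], stdin=stacks, stdout=out, check=True)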