Example #1
class ProfilerTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
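        # spark.python.profile must be enabled before the SparkContext is created,
        # so that a profiler_collector is attached to the context.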
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[4]', class_name, conf=conf)

    def test_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        id, profiler, _ = profilers[0]
        stats = profiler.stats()
        self.assertTrue(stats is not None)
        width, stat_list = stats.get_print_list([])
        func_names = [func_name for fname, n, func_name in stat_list]
        self.assertTrue("heavy_foo" in func_names)

        old_stdout = sys.stdout
        sys.stdout = io = StringIO()
        self.sc.show_profiles()
        self.assertTrue("heavy_foo" in io.getvalue())
        sys.stdout = old_stdout

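        # dump_profiles() writes one rdd_<id>.pstats file per profiled RDD into the directory.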
        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)
        self.assertTrue("rdd_%d.pstats" % id in os.listdir(d))

    def test_custom_profiler(self):
        class TestCustomProfiler(BasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        def heavy_foo(x):
            for i in range(1 << 18):
                x = 1

        rdd = self.sc.parallelize(range(100))
        rdd.foreach(heavy_foo)
Example #2
def spark_main():
    """ Main Spark entry point """

    conf = SparkConf().setAll(
        (("spark.python.profile", "true" if args.profile else "false"),
         ("spark.task.maxFailures", "20")))

    # TODO could this be set somewhere in cosr-ops instead?
    executor_environment = {}
    if config["ENV"] == "prod":
        executor_environment = {
            "PYTHONPATH": "/cosr/back",
            "PYSPARK_PYTHON": "/cosr/back/venv/bin/python",
            "LD_LIBRARY_PATH": "/usr/local/lib"
        }

    sc = SparkContext(appName="Common Search Index",
                      conf=conf,
                      environment=executor_environment)

    # First, generate a list of all WARC files
    warc_filenames = list_warc_filenames()

    # Then split their indexing in Spark workers
    warc_records = sc.parallelize(warc_filenames,
                                  len(warc_filenames)).flatMap(iter_records)

    if args.save_linkgraph_domains:

        # Here we begin using the real power of Spark: get all unique (from, to) tuples
        # from all the links in all the pages
        warc_links = warc_records.flatMap(iter_links_domain).distinct().map(
            lambda row: "%s %s" % row).coalesce(1)

        # warc_links.foreach(print_rows)

        warc_links.saveAsTextFile(args.save_linkgraph_domains)

    else:

        # This .count() call is what triggers the whole Spark pipeline
        print "Indexed %s WARC records" % warc_records.count()

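    # show_profiles() prints the aggregated cProfile stats collected from the Python workers.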
    if args.profile:
        sc.show_profiles()

    sc.stop()
Example #3
    def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf()

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(appName=self.name, conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)

        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop()
Example #4
    def test_profiler_disabled(self):
        sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
        try:
            self.assertRaisesRegex(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.show_profiles())
            self.assertRaisesRegex(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.dump_profiles("/tmp/abc"))
        finally:
            sc.stop()
Example #5
    def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf().setAll((
            ("spark.task.maxFailures", "10"),
            ("spark.locality.wait", "20s"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ))

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(
            appName=self.name,
            conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)
        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop()
Example #6
    new_df.show()

    # dataToArray = new_df.select(columns: _*).collect.map(_.toSeq)

    # # Trains a k-means model.
    # kmeans = KMeans().setK(2).setSeed(1)
    # model = kmeans.fit(new_df.select('features'))

    cost = np.zeros(20)
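    # Note: this conf only enables profiling if it is passed in when the
    # SparkContext/SparkSession is created; setting it here alone has no effect.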
    conf = SparkConf().set("spark.python.profile", "true")
    for k in range(2, 20):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        model = kmeans.fit(new_df.sample(False, 0.1, seed=42))
        cost[k] = model.computeCost(new_df)  # requires Spark 2.0 or later
        print("cost of ", k, " centroids is ", cost[k])
    sc.show_profiles()

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.show()
    transformed = model.transform(new_df)
    transformed.show(100)

    sc.stop()
    # Evaluate clustering by computing Silhouette score
    # evaluator = ClusteringEvaluator()
    #
    # silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))
Example #7
def main():
    conf = SparkConf().set("spark.python.profile", "true")  # required for sc.show_profiles() at the end
    sc = SparkContext('local[15]', 'haha', conf=conf)

    print(sc.getConf().getAll())

    d = load(sc)
    data_train_lp, data_dev_p, label_dev_gt, test_p = d['train_tfidf_lp'], d['dev_tfidf'], d['dev_gt'], d['test_tfidf']
    data_train_p, label_train_gt = d['train_tfidf'], d['train_gt']
    data_train, data_dev, data_test = d['train_raw'], d['dev_raw'], d['test_raw']

    data_train_lp = data_train_lp.sample(False, 0.01)
    
    # print(sum(data_train_lp.first()[0]))
    # print(data_train_lp.zipWithIndex().collect())
    print(data_train_lp.take(2))
    print("___________train_bayes_____________")
    sys.stdout.flush()
    nb = NaiveBayes.train(data_train_lp)
    print("___________trained_bayes___________")
    sys.stdout.flush()
    # nb.save(sc, 'bayes.model')
    bayes_result_dev = nb.predict(data_dev_p).map(int)
    bayes_result_dev.count()
    bayes_result_train = nb.predict(data_train_p).map(int)
    bayes_result_train.count()
    bayes_result_test = nb.predict(test_p).map(int)
    bayes_result_test.count()
    
    print("train info:")
    valid(bayes_result_train, label_train_gt)
    print("dev info:")
    valid(bayes_result_dev, label_dev_gt)

    print("___________train_logistic_____________")
    sys.stdout.flush()
    lg = LogisticRegressionWithSGD.train(data_train_lp, step=0.005)
    print("___________trained_logisitc___________")
    sys.stdout.flush()
    # lg.save(sc, 'logistic.model')
    logistic_result_dev = lg.predict(data_dev_p).map(int)
    logistic_result_train = lg.predict(data_train_p).map(int)
    logistic_result_test = lg.predict(test_p).map(int)

    print("train info:")
    valid(logistic_result_train, label_train_gt)
    print("dev info:")
    valid(logistic_result_dev, label_dev_gt)

    fused_train_p = stack_label([bayes_result_train, logistic_result_train])
    fused_dev_p = stack_label([bayes_result_dev, logistic_result_dev])
    fused_test_p = stack_label([bayes_result_test, logistic_result_test])

    fused_train_lp = label(data_train, fused_train_p)

    print("___________train_GBDT___________")
    sys.stdout.flush()
    gbdt = GradientBoostedTrees.trainClassifier(fused_train_lp, {})
    print('___________trained_GBDT_________')
    sys.stdout.flush()

    fused_result_train = gbdt.predict(fused_train_p)
    fused_result_dev = gbdt.predict(fused_dev_p)
    fused_result_test = gbdt.predict(fused_test_p)

    print("train info:")
    valid(fused_result_train, label_train_gt)
    print("dev info:")
    valid(fused_result_dev, label_dev_gt)

    dump(fused_result_test.map(int).collect())

    sc.show_profiles()
Example #8
class UDFProfilerTests(unittest.TestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext("local[4]", class_name, conf=conf)
        self.spark = SparkSession.builder._sparkContext(self.sc).getOrCreate()

    def tearDown(self):
        self.spark.stop()
        sys.path = self._old_sys_path

    def test_udf_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))

        old_stdout = sys.stdout
        try:
            sys.stdout = io = StringIO()
            self.sc.show_profiles()
        finally:
            sys.stdout = old_stdout

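        # Each UDF profile is dumped as a udf_<id>.pstats file in the target directory.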
        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)

        for i, udf_name in enumerate(["add1", "add2", "add1"]):
            id, profiler, _ = profilers[i]
            with self.subTest(id=id, udf_name=udf_name):
                stats = profiler.stats()
                self.assertTrue(stats is not None)
                width, stat_list = stats.get_print_list([])
                func_names = [func_name for fname, n, func_name in stat_list]
                self.assertTrue(udf_name in func_names)

                self.assertTrue(udf_name in io.getvalue())
                self.assertTrue("udf_%d.pstats" % id in os.listdir(d))

    def test_custom_udf_profiler(self):
        class TestCustomProfiler(UDFBasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.udf_profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        @udf
        def add1(x):
            return x + 1

        @udf
        def add2(x):
            return x + 2

        df = self.spark.range(10)
        df.select(add1("id"), add2("id"), add1("id")).collect()
Example #9
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
conf.setMaster('spark://hadoop-maste:7077')
conf.set('spark.python.profile', 'true')
context = SparkContext(conf=conf)
rdd = context.parallelize(np.arange(10), 3)
print(rdd.collect())
context.show_profiles()
context.dump_profiles('/datas/profiles/')
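# The dumped rdd_*.pstats files can be loaded later with Python's pstats module.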
context.stop()