Example #1
class ProfilerTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
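        # spark.python.profile must be enabled before the SparkContext is created,
        # so that a profiler_collector is attached to the context.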
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[4]', class_name, conf=conf)

    def test_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        id, profiler, _ = profilers[0]
        stats = profiler.stats()
        self.assertTrue(stats is not None)
        width, stat_list = stats.get_print_list([])
        func_names = [func_name for fname, n, func_name in stat_list]
        self.assertTrue("heavy_foo" in func_names)

        old_stdout = sys.stdout
        sys.stdout = io = StringIO()
        self.sc.show_profiles()
        self.assertTrue("heavy_foo" in io.getvalue())
        sys.stdout = old_stdout

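        # dump_profiles() writes one rdd_<id>.pstats file per profiled RDD into the directory.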
        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)
        self.assertTrue("rdd_%d.pstats" % id in os.listdir(d))

    def test_custom_profiler(self):
        class TestCustomProfiler(BasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        def heavy_foo(x):
            for i in range(1 << 18):
                x = 1

        rdd = self.sc.parallelize(range(100))
        rdd.foreach(heavy_foo)
Example #2
def spark_main():
    """ Main Spark entry point """

    conf = SparkConf().setAll(
        (("spark.python.profile", "true" if args.profile else "false"),
         ("spark.task.maxFailures", "20")))

    # TODO could this be set somewhere in cosr-ops instead?
    executor_environment = {}
    if config["ENV"] == "prod":
        executor_environment = {
            "PYTHONPATH": "/cosr/back",
            "PYSPARK_PYTHON": "/cosr/back/venv/bin/python",
            "LD_LIBRARY_PATH": "/usr/local/lib"
        }

    sc = SparkContext(appName="Common Search Index",
                      conf=conf,
                      environment=executor_environment)

    # First, generate a list of all WARC files
    warc_filenames = list_warc_filenames()

    # Then split their indexing in Spark workers
    warc_records = sc.parallelize(warc_filenames,
                                  len(warc_filenames)).flatMap(iter_records)

    if args.save_linkgraph_domains:

        # Here we begin using the real power of Spark: get all unique (from, to) tuples
        # from all the links in all the pages
        warc_links = warc_records.flatMap(iter_links_domain).distinct().map(
            lambda row: "%s %s" % row).coalesce(1)

        # warc_links.foreach(print_rows)

        warc_links.saveAsTextFile(args.save_linkgraph_domains)

    else:

        # This .count() call is what triggers the whole Spark pipeline
        print "Indexed %s WARC records" % warc_records.count()

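    # show_profiles() prints the aggregated cProfile stats collected from the Python workers.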
    if args.profile:
        sc.show_profiles()

    sc.stop()
Example #3
    def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf()

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(appName=self.name, conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)

        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop()
Example #4
    def test_profiler_disabled(self):
        sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
        try:
            self.assertRaisesRegex(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.show_profiles())
            self.assertRaisesRegex(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.dump_profiles("/tmp/abc"))
        finally:
            sc.stop()
Example #5
    def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf().setAll((
            ("spark.task.maxFailures", "10"),
            ("spark.locality.wait", "20s"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ))

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(
            appName=self.name,
            conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)
        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop()
Example #6
    new_df.show()

    # dataToArray = new_df.select(columns: _*).collect.map(_.toSeq)

    # # Trains a k-means model.
    # kmeans = KMeans().setK(2).setSeed(1)
    # model = kmeans.fit(new_df.select('features'))

    cost = np.zeros(20)
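    # Note: this conf only enables profiling if it is passed in when the
    # SparkContext/SparkSession is created; setting it here alone has no effect.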
    conf = SparkConf().set("spark.python.profile", "true")
    for k in range(2, 20):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        model = kmeans.fit(new_df.sample(False, 0.1, seed=42))
        cost[k] = model.computeCost(new_df)  # requires Spark 2.0 or later
        print("cost of ", k, " centroids is ", cost[k])
    sc.show_profiles()

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.show()
    transformed = model.transform(new_df)
    transformed.show(100)

    sc.stop()
    # Evaluate clustering by computing Silhouette score
    # evaluator = ClusteringEvaluator()
    #
    # silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))
Example #7
def main():
    conf = SparkConf().set("spark.python.profile", "true")  # required for sc.show_profiles() at the end
    sc = SparkContext('local[15]', 'haha', conf=conf)

    print(sc.getConf().getAll())

    d = load(sc)
    data_train_lp, data_dev_p, label_dev_gt, test_p = d['train_tfidf_lp'], d['dev_tfidf'], d['dev_gt'], d['test_tfidf']
    data_train_p, label_train_gt = d['train_tfidf'], d['train_gt']
    data_train, data_dev, data_test = d['train_raw'], d['dev_raw'], d['test_raw']

    data_train_lp = data_train_lp.sample(False, 0.01)
    
    # print(sum(data_train_lp.first()[0]))
    # print(data_train_lp.zipWithIndex().collect())
    print(data_train_lp.take(2))
    print("___________train_bayes_____________")
    sys.stdout.flush()
    nb = NaiveBayes.train(data_train_lp)
    print("___________trained_bayes___________")
    sys.stdout.flush()
    # nb.save(sc, 'bayes.model')
    bayes_result_dev = nb.predict(data_dev_p).map(int)
    bayes_result_dev.count()
    bayes_result_train = nb.predict(data_train_p).map(int)
    bayes_result_train.count()
    bayes_result_test = nb.predict(test_p).map(int)
    bayes_result_test.count()
    
    print("train info:")
    valid(bayes_result_train, label_train_gt)
    print("dev info:")
    valid(bayes_result_dev, label_dev_gt)

    print("___________train_logistic_____________")
    sys.stdout.flush()
    lg = LogisticRegressionWithSGD.train(data_train_lp, step=0.005)
    print("___________trained_logisitc___________")
    sys.stdout.flush()
    # lg.save(sc, 'logistic.model')
    logistic_result_dev = lg.predict(data_dev_p).map(int)
    logistic_result_train = lg.predict(data_train_p).map(int)
    logistic_result_test = lg.predict(test_p).map(int)

    print("train info:")
    valid(logistic_result_train, label_train_gt)
    print("dev info:")
    valid(logistic_result_dev, label_dev_gt)

    fused_train_p = stack_label([bayes_result_train, logistic_result_train])
    fused_dev_p = stack_label([bayes_result_dev, logistic_result_dev])
    fused_test_p = stack_label([bayes_result_test, logistic_result_test])

    fused_train_lp = label(data_train, fused_train_p)

    print("___________train_GBDT___________")
    sys.stdout.flush()
    gbdt = GradientBoostedTrees.trainClassifier(fused_train_lp, {})
    print('___________trained_GBDT_________')
    sys.stdout.flush()

    fused_result_train = gbdt.predict(fused_train_p)
    fused_result_dev = gbdt.predict(fused_dev_p)
    fused_result_test = gbdt.predict(fused_test_p)

    print("train info:")
    valid(fused_result_train, label_train_gt)
    print("dev info:")
    valid(fused_result_dev, label_dev_gt)

    dump(fused_result_test.map(int).collect())

    sc.show_profiles()
Example #8
class UDFProfilerTests(unittest.TestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext("local[4]", class_name, conf=conf)
        self.spark = SparkSession.builder._sparkContext(self.sc).getOrCreate()

    def tearDown(self):
        self.spark.stop()
        sys.path = self._old_sys_path

    def test_udf_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))

        old_stdout = sys.stdout
        try:
            sys.stdout = io = StringIO()
            self.sc.show_profiles()
        finally:
            sys.stdout = old_stdout

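        # Each UDF profile is dumped as a udf_<id>.pstats file in the target directory.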
        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)

        for i, udf_name in enumerate(["add1", "add2", "add1"]):
            id, profiler, _ = profilers[i]
            with self.subTest(id=id, udf_name=udf_name):
                stats = profiler.stats()
                self.assertTrue(stats is not None)
                width, stat_list = stats.get_print_list([])
                func_names = [func_name for fname, n, func_name in stat_list]
                self.assertTrue(udf_name in func_names)

                self.assertTrue(udf_name in io.getvalue())
                self.assertTrue("udf_%d.pstats" % id in os.listdir(d))

    def test_custom_udf_profiler(self):
        class TestCustomProfiler(UDFBasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.udf_profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        @udf
        def add1(x):
            return x + 1

        @udf
        def add2(x):
            return x + 2

        df = self.spark.range(10)
        df.select(add1("id"), add2("id"), add1("id")).collect()
Example #9
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
conf.setMaster('spark://hadoop-maste:7077')
conf.set('spark.python.profile', 'true')
context = SparkContext(conf=conf)
rdd = context.parallelize(np.arange(10), 3)
print(rdd.collect())
context.show_profiles()
context.dump_profiles('/datas/profiles/')
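# The dumped rdd_*.pstats files can be loaded later with Python's pstats module.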
context.stop()