class ProfilerTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[4]', class_name, conf=conf)

    def test_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        id, profiler, _ = profilers[0]
        stats = profiler.stats()
        self.assertTrue(stats is not None)
        width, stat_list = stats.get_print_list([])
        func_names = [func_name for fname, n, func_name in stat_list]
        self.assertTrue("heavy_foo" in func_names)

        old_stdout = sys.stdout
        sys.stdout = io = StringIO()
        self.sc.show_profiles()
        self.assertTrue("heavy_foo" in io.getvalue())
        sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)
        self.assertTrue("rdd_%d.pstats" % id in os.listdir(d))

    def test_custom_profiler(self):
        class TestCustomProfiler(BasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        def heavy_foo(x):
            for i in range(1 << 18):
                x = 1

        rdd = self.sc.parallelize(range(100))
        rdd.foreach(heavy_foo)
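Besides assigning `profiler_collector.profiler_cls` as in the test above, a custom profiler can also be passed when the context is built, via SparkContext's `profiler_cls` argument. A minimal sketch, assuming a local master; the class name and app name are illustrative:

from pyspark import SparkConf, SparkContext, BasicProfiler

class MyCustomProfiler(BasicProfiler):
    def show(self, id):
        print("My custom profiles for RDD: %s" % id)

conf = SparkConf().set("spark.python.profile", "true")
sc = SparkContext('local', 'profiler_demo', conf=conf, profiler_cls=MyCustomProfiler)

# Each action on a profiled RDD adds an entry to the profiler collector.
sc.parallelize(range(1000)).map(lambda x: 2 * x).take(10)
sc.parallelize(range(1000)).count()

sc.show_profiles()  # dispatches to MyCustomProfiler.show for each profiled RDD id
sc.stop()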
def spark_main():
    """ Main Spark entry point """

    conf = SparkConf().setAll((
        ("spark.python.profile", "true" if args.profile else "false"),
        ("spark.task.maxFailures", "20")
    ))

    # TODO could this be set somewhere in cosr-ops instead?
    executor_environment = {}
    if config["ENV"] == "prod":
        executor_environment = {
            "PYTHONPATH": "/cosr/back",
            "PYSPARK_PYTHON": "/cosr/back/venv/bin/python",
            "LD_LIBRARY_PATH": "/usr/local/lib"
        }

    sc = SparkContext(appName="Common Search Index", conf=conf, environment=executor_environment)

    # First, generate a list of all WARC files
    warc_filenames = list_warc_filenames()

    # Then split their indexing in Spark workers
    warc_records = sc.parallelize(warc_filenames, len(warc_filenames)).flatMap(iter_records)

    if args.save_linkgraph_domains:
        # Here we begin using the real power of Spark: get all unique (from, to) tuples
        # from all the links in all the pages
        warc_links = warc_records.flatMap(iter_links_domain).distinct().map(
            lambda row: "%s %s" % row).coalesce(1)

        # warc_links.foreach(print_rows)
        warc_links.saveAsTextFile(args.save_linkgraph_domains)

    else:
        # This .count() call is what triggers the whole Spark pipeline
        print "Indexed %s WARC records" % warc_records.count()

    if args.profile:
        sc.show_profiles()

    sc.stop()
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()

    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(appName=self.name, conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
def test_profiler_disabled(self):
    sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
    try:
        self.assertRaisesRegexp(
            RuntimeError,
            "'spark.python.profile' configuration must be set",
            lambda: sc.show_profiles())
        self.assertRaisesRegexp(
            RuntimeError,
            "'spark.python.profile' configuration must be set",
            lambda: sc.dump_profiles("/tmp/abc"))
    finally:
        sc.stop()
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf().setAll((
        ("spark.task.maxFailures", "10"),
        ("spark.locality.wait", "20s"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ))

    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
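Both `run()` methods above branch on `self.args.spark_profiler`, which comes from the elided `parse_arguments()`. A hypothetical sketch of that wiring, assuming an argparse-based parser (flag name and help text are my own, not taken from the source):

import argparse

def parse_arguments(self):
    # Hypothetical parser matching the run() methods above; the real
    # parse_arguments() is not shown in these excerpts.
    parser = argparse.ArgumentParser(description=self.name)
    parser.add_argument("--spark-profiler", action="store_true",
                        help="enable spark.python.profile and print profiles on exit")
    return parser.parse_args()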
new_df.show()

# dataToArray = new_df.select(columns: _*).collect.map(_.toSeq)

# # Trains a k-means model.
# kmeans = KMeans().setK(2).setSeed(1)
# model = kmeans.fit(new_df.select('features'))

cost = np.zeros(20)
# Note: this conf only takes effect if it is passed to the SparkContext/SparkSession
# when it is created; otherwise sc.show_profiles() below raises a RuntimeError.
conf = SparkConf().set("spark.python.profile", "true")
for k in range(2, 20):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(new_df.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(new_df)  # requires Spark 2.0 or later
    print("cost of ", k, " centroids is ", cost[k])

sc.show_profiles()

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')
plt.show()

transformed = model.transform(new_df)
transformed.show(100)

sc.stop()

# Evaluate clustering by computing Silhouette score
# evaluator = ClusteringEvaluator()
#
# silhouette = evaluator.evaluate(predictions)
# print("Silhouette with squared euclidean distance = " + str(silhouette))
def main():
    # Profiling must be enabled on the SparkConf before the context is created;
    # setting sc._conf afterwards has no effect, and show_profiles() below
    # requires spark.python.profile to be set.
    conf = SparkConf().set("spark.python.profile", "true")
    sc = SparkContext('local[15]', 'haha', conf=conf)
    print(sc.getConf().getAll())

    d = load(sc)
    data_train_lp, data_dev_p, label_dev_gt, test_p = d['train_tfidf_lp'], d['dev_tfidf'], d['dev_gt'], d['test_tfidf']
    data_train_p, label_train_gt = d['train_tfidf'], d['train_gt']
    data_train, data_dev, data_test = d['train_raw'], d['dev_raw'], d['test_raw']
    data_train_lp = data_train_lp.sample(False, 0.01)
    # print(sum(data_train_lp.first()[0]))
    # print(data_train_lp.zipWithIndex().collect())
    print(data_train_lp.take(2))

    print("___________train_bayes_____________")
    sys.stdout.flush()
    nb = NaiveBayes.train(data_train_lp)
    print("___________trained_bayes___________")
    sys.stdout.flush()
    # nb.save(sc, 'bayes.model')
    bayes_result_dev = nb.predict(data_dev_p).map(int)
    bayes_result_dev.count()
    bayes_result_train = nb.predict(data_train_p).map(int)
    bayes_result_train.count()
    bayes_result_test = nb.predict(test_p).map(int)
    bayes_result_test.count()
    print("train info:")
    valid(bayes_result_train, label_train_gt)
    print("dev info:")
    valid(bayes_result_dev, label_dev_gt)

    print("___________train_logistic_____________")
    sys.stdout.flush()
    lg = LogisticRegressionWithSGD.train(data_train_lp, step=0.005)
    print("___________trained_logistic___________")
    sys.stdout.flush()
    # lg.save(sc, 'logistic.model')
    logistic_result_dev = lg.predict(data_dev_p).map(int)
    logistic_result_train = lg.predict(data_train_p).map(int)
    logistic_result_test = lg.predict(test_p).map(int)
    print("train info:")
    valid(logistic_result_train, label_train_gt)
    print("dev info:")
    valid(logistic_result_dev, label_dev_gt)

    fused_train_p = stack_label([bayes_result_train, logistic_result_train])
    fused_dev_p = stack_label([bayes_result_dev, logistic_result_dev])
    fused_test_p = stack_label([bayes_result_test, logistic_result_test])
    fused_train_lp = label(data_train, fused_train_p)

    print("___________train_GBDT___________")
    sys.stdout.flush()
    gbdt = GradientBoostedTrees.trainClassifier(fused_train_lp, {})
    print('___________trained_GBDT_________')
    sys.stdout.flush()
    fused_result_train = gbdt.predict(fused_train_p)
    fused_result_dev = gbdt.predict(fused_dev_p)
    fused_result_test = gbdt.predict(fused_test_p)
    print("train info:")
    valid(fused_result_train, label_train_gt)
    print("dev info:")
    valid(fused_result_dev, label_dev_gt)

    dump(fused_result_test.map(int).collect())
    sc.show_profiles()
class UDFProfilerTests(unittest.TestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext("local[4]", class_name, conf=conf)
        self.spark = SparkSession.builder._sparkContext(self.sc).getOrCreate()

    def tearDown(self):
        self.spark.stop()
        sys.path = self._old_sys_path

    def test_udf_profiler(self):
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))

        old_stdout = sys.stdout
        try:
            sys.stdout = io = StringIO()
            self.sc.show_profiles()
        finally:
            sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)

        for i, udf_name in enumerate(["add1", "add2", "add1"]):
            id, profiler, _ = profilers[i]
            with self.subTest(id=id, udf_name=udf_name):
                stats = profiler.stats()
                self.assertTrue(stats is not None)
                width, stat_list = stats.get_print_list([])
                func_names = [func_name for fname, n, func_name in stat_list]
                self.assertTrue(udf_name in func_names)

                self.assertTrue(udf_name in io.getvalue())
                self.assertTrue("udf_%d.pstats" % id in os.listdir(d))

    def test_custom_udf_profiler(self):
        class TestCustomProfiler(UDFBasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.udf_profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        @udf
        def add1(x):
            return x + 1

        @udf
        def add2(x):
            return x + 2

        df = self.spark.range(10)
        df.select(add1("id"), add2("id"), add1("id")).collect()
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
conf.setMaster('spark://hadoop-maste:7077')
conf.set('spark.python.profile', 'true')

context = SparkContext(conf=conf)
rdd = context.parallelize(np.arange(10), 3)
print(rdd.collect())

context.show_profiles()
context.dump_profiles('/datas/profiles/')
context.stop()
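Distilled from the examples above, a minimal end-to-end sketch of the workflow: enable spark.python.profile on the conf, run an action, then print and dump the collected stats. The master, app name, workload, and output directory are placeholders:

import os
import tempfile

from pyspark import SparkConf, SparkContext

# Profiling must be enabled on the conf before the context is created.
conf = SparkConf().set("spark.python.profile", "true")
sc = SparkContext("local[2]", "profile_sketch", conf=conf)

def work(x):
    # Burn some CPU so this function shows up clearly in the profile output.
    for _ in range(1 << 16):
        x += 1
    return x

sc.parallelize(range(100)).map(work).count()

sc.show_profiles()  # print accumulated cProfile stats for each profiled RDD
out_dir = os.path.join(tempfile.gettempdir(), "pyspark_profiles")  # placeholder path
sc.dump_profiles(out_dir)  # writes rdd_<id>.pstats files into out_dir
sc.stop()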