def test_run_spark():
    # Run a Spark job and then check to make sure we got the result.
    # To get the result back, we have to save it in a file. But we only want to call
    # NamedTemporaryFile once, so we store the temporary file name in an environment variable.
    # For the same reason, we can't open the file in truncate mode.
    if not cspark.spark_available():
        return                  # don't test if no Spark is available

    if TEST_RUN_SPARK_FILENAME not in os.environ:
        import tempfile
        f = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        os.environ[TEST_RUN_SPARK_FILENAME] = f.name
        f.close()

    with open(os.environ[TEST_RUN_SPARK_FILENAME], "w+") as f:
        if cspark.spark_submit(loglevel='error', pyfiles=[CSPARK_PATH], argv=[__file__]):
            # This branch runs the actual Spark job: compute the sum, record it in the
            # shared temporary file, and exit.
            from pyspark import SparkContext, SparkConf
            import operator
            conf = SparkConf().setAppName("cspark_test:test_run_spark")
            sc = SparkContext(conf=conf)
            sc.setLogLevel("ERROR")
            mysum = sc.parallelize(range(1000000)).reduce(operator.add)
            f.truncate(0)
            f.write("{}\n".format(mysum))
            f.close()
            exit(0)             # Spark job is finished
        # Otherwise, read back the result that the Spark run wrote into the file.
        f.seek(0)
        data = f.read()
        assert data == '499999500000\n'
        print("spark ran successfully")
    os.unlink(os.environ[TEST_RUN_SPARK_FILENAME])
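
# The test above hands the temporary file name to the Spark-launched run of this file
# through an environment variable, because the environment survives the re-execution
# while local variables do not. Below is a minimal sketch of that hand-off using a
# plain subprocess in place of Spark. The names _demo_envvar_handoff and
# DEMO_RESULT_FILENAME are illustrative only and are not part of cspark.
def _demo_envvar_handoff():
    import subprocess
    import sys
    import tempfile

    if 'DEMO_RESULT_FILENAME' not in os.environ:
        # Create the temporary file exactly once and record its name where a
        # re-executed child process can find it.
        tf = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        os.environ['DEMO_RESULT_FILENAME'] = tf.name
        tf.close()
    # The child sees only the environment, so the filename survives the hand-off.
    subprocess.check_call(
        [sys.executable, '-c',
         'import os; open(os.environ["DEMO_RESULT_FILENAME"], "w").write("done\\n")'],
        env=os.environ)
    with open(os.environ['DEMO_RESULT_FILENAME']) as tf:
        assert tf.read() == 'done\n'
    os.unlink(os.environ['DEMO_RESULT_FILENAME'])
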
def test_spark_submit():
    # Run a Spark job and then check to make sure we got the result.
    # To get the result back, we have to save it in a file. But we only want to call
    # NamedTemporaryFile once, so we store the temporary file name in an environment variable.
    # For the same reason, we can't open the file in truncate mode.
    # This test is disabled; the warning below explains why.
    return
    raise RuntimeWarning("""WARNING: this test can make the whole test suite exit, likely because
    of the use of os.execvp in cspark.py. See comments inline in the test""")
    if not cspark.spark_available():
        return                  # don't test if no Spark is available
    # spark-submit will run in a subprocess
    if TEST_RUN_SPARK_FILENAME not in os.environ:
        import tempfile
        f = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        os.environ[TEST_RUN_SPARK_FILENAME] = f.name
        f.close()
    """
                                 'pid': os.getpid(),
                                 'x': x,
                                 'y': y,
                                 'func': 'myadder',
                                 'applicationId': applicationId()}))
    return x + y


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args = parser.parse_args()

    if not cspark.spark_available():
        print("Spark is not available.")
        exit(0)

    print("Running spark with 16 executors.... My PID is {}".format(os.getpid()))
    sc = cspark.spark_session(num_executors=16,
                              pyfiles=[os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                                    'clogging.py')]).sparkContext
    print("Spark Context Obtained. sc={} My PID is now {}".format(sc, os.getpid()))
    print("application id:", sc.applicationId)
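    # Illustrative follow-up (not part of the original demo): with the SparkContext in
    # hand, a small reduce using myadder exercises the executors end-to-end.
    # The expected value 55 is just sum(1..10).
    total = sc.parallelize(range(1, 11)).reduce(myadder)
    print("sum of 1..10 computed on the cluster:", total)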