Example 1
def test_run_spark():
    # Run a Spark job and then check to make sure we got the result.
    # To get the result back, we have to save it in a file. But we only want to call
    # NamedTemporaryFile once, so we store the temporary file name in an environment variable.
    # For the same reason, we can't open the file in truncate mode.

    if not cspark.spark_available():
        return  # don't test if no Spark is available

    if TEST_RUN_SPARK_FILENAME not in os.environ:
        import tempfile
        f = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        os.environ[TEST_RUN_SPARK_FILENAME] = f.name
        f.close()

    with open(os.environ[TEST_RUN_SPARK_FILENAME], "a+") as f:  # append mode; "w+" would truncate, contradicting the comment above
        if cspark.spark_submit(loglevel='error',
                               pyfiles=[CSPARK_PATH],
                               argv=[__file__]):
            from pyspark import SparkContext, SparkConf
            import operator
            conf = SparkConf().setAppName("cspark_test:test_run_spark")
            sc = SparkContext(conf=conf)
            sc.setLogLevel("ERROR")
            mysum = sc.parallelize(range(1000000)).reduce(operator.add)
            f.truncate(0)
            f.write("{}\n".format(mysum))
            f.close()
            exit(0)  # Spark job is finished
        f.seek(0)
        data = f.read()
        assert data == '499999500000\n'
        print("spark ran successfully")
    os.unlink(os.environ[TEST_RUN_SPARK_FILENAME])
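
The handshake in the test above (call NamedTemporaryFile exactly once, publish its name through an environment variable, and let the re-executed copy of the script write a result that the original process reads back) does not depend on Spark. Below is a minimal sketch of the same pattern; the RESULT_FILENAME variable and the IS_CHILD flag are hypothetical, and subprocess stands in for cspark.spark_submit re-running __file__.

import os
import subprocess
import sys
import tempfile

RESULT_FILENAME = "RESULT_FILENAME"   # hypothetical name for the env var

if __name__ == "__main__":
    if os.environ.get("IS_CHILD") == "1":
        # Child: write the answer into the file whose name the parent published.
        with open(os.environ[RESULT_FILENAME], "w") as f:
            f.write("{}\n".format(sum(range(1000000))))
        sys.exit(0)

    # Parent: create the temporary file exactly once and hand its name to the child.
    tmp = tempfile.NamedTemporaryFile(delete=False, mode="w+")
    os.environ[RESULT_FILENAME] = tmp.name
    tmp.close()
    subprocess.run([sys.executable, __file__],
                   env={**os.environ, "IS_CHILD": "1"}, check=True)
    with open(os.environ[RESULT_FILENAME]) as f:
        assert f.read() == "499999500000\n"
    os.unlink(os.environ[RESULT_FILENAME])
    print("child result verified")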
Example 2
def test_spark_submit():
    # Run a Spark job and then check to make sure we got the result.
    # To get the result back, we have to save it in a file. But we only want to call
    # NamedTemporaryFile once, so we store the temporary file name in an environment variable.
    # For the same reason, we can't open the file in truncate mode.

    return  # test currently disabled; see the warning below

    raise RuntimeWarning("""WARNING: this test can make the entire test suite exit, likely because of the use of os.execvp in cspark.py. See the inline comments in this test""")

    if not cspark.spark_available():
        return                  # don't test if no Spark is available

    # spark-submit will run in a subprocess

    if TEST_RUN_SPARK_FILENAME not in os.environ:
        import tempfile
        f = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        os.environ[TEST_RUN_SPARK_FILENAME] = f.name
        f.close()

    """
Example 3
            'pid': os.getpid(),
            'x': x,
            'y': y,
            'func': 'myadder',
            'applicationId': applicationId()
        }))
    return x + y


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args = parser.parse_args()

    if not cspark.spark_available():
        print("Spark is not available.")
        exit(0)

    print("Running spark with 16 executors.... My PID is {}".format(
        os.getpid()))
    sc = cspark.spark_session(
        num_executors=16,
        pyfiles=[
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'clogging.py')
        ]).sparkContext
    print("Spark Context Obtained. sc={}  My PID is now {}".format(
        sc, os.getpid()))
    print("application id:", sc.applicationId)