Example 1
import logging
import time
from datetime import datetime

from pyspark import SparkConf
from pyspark.sql import SparkSession


def create_spark_session(app_name="SparkApplication_{}".format(
        datetime.utcfromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))):
    # NOTE: spark_utils is a project-specific helper module and
    # spark_conf_dict is expected to be defined in the enclosing scope
    # (e.g. loaded from application configuration) before this is called.

    def init_calc_props(spark_ctx_conf):
        # Derive parallelism settings only when both custom keys are present
        # in the configuration.
        if {'parallelism-to-tasks-factor', 'total-executor-cores'
                } <= set(dict(spark_ctx_conf.getAll())):
            total_executor_cores = spark_utils.calc_max_cores(
                int(spark_ctx_conf.get('total-executor-cores')))
            parallelism_to_tasks_factor = int(
                spark_ctx_conf.get('parallelism-to-tasks-factor'))
            value = str(total_executor_cores * parallelism_to_tasks_factor)
            logging.info(
                "total_executor_cores: {0}, parallelism_to_tasks_factor: {1}, value: {2}"
                .format(total_executor_cores,
                        parallelism_to_tasks_factor, value))
            spark_ctx_conf.set('spark.default.parallelism', value)
            spark_ctx_conf.set('spark.sql.shuffle.partitions', value)
            logging.debug('Starting local spark with conf: {0}'.format(
                "\n".join(str(v) for v in spark_ctx_conf.getAll())))

    spark_conf = SparkConf()
    spark_conf.setAll(spark_conf_dict.items())
    init_calc_props(spark_conf)
    sc = spark_utils.generate_spark_context(spark_conf=spark_conf)
    spark = SparkSession(sc)

    spark.sparkContext.setLogLevel("WARN")

    return spark
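
A minimal usage sketch, assuming the project-specific spark_utils module is importable and that spark_conf_dict is defined in the same module before the call; every key and value below is illustrative, not taken from the original example:

# Illustrative configuration; the two non-"spark." keys feed init_calc_props.
spark_conf_dict = {
    'spark.master': 'local[4]',
    'total-executor-cores': '4',
    'parallelism-to-tasks-factor': '2',
}

spark = create_spark_session()
df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
df.show()
spark.stop()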
Example 2
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


def initialize(target_partitions=None):
    """Returns SparkContext and SQLContext."""
    conf = SparkConf()
    extra_settings = {
        'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
        'spark.executor.extraJavaOptions': '-XX:+UseG1GC'
    }
    if target_partitions:
        extra_settings['spark.default.parallelism'] = target_partitions

    conf.setAll(extra_settings.items())
    environment = {'PYTHON_EGG_CACHE': '/tmp/python-eggs'}
    sc = SparkContext(conf=conf, environment=environment)

    sqlContext = SQLContext(sc)
    if target_partitions:
        sqlContext.setConf('spark.sql.shuffle.partitions', target_partitions)

    # Raise the JVM-side Log4j loggers to ERROR to silence Spark/Akka INFO noise.
    jvm_logger = sc._jvm.org.apache.log4j
    jvm_logger.LogManager.getLogger("org").setLevel(jvm_logger.Level.ERROR)
    jvm_logger.LogManager.getLogger("akka").setLevel(jvm_logger.Level.ERROR)
    return sc, sqlContext
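
A hedged usage sketch; the sample data and the target_partitions value are made up for illustration:

sc, sqlContext = initialize(target_partitions='200')
rdd = sc.parallelize([('a', 1), ('b', 2), ('a', 3)])
df = sqlContext.createDataFrame(rdd, ['key', 'value'])
df.groupBy('key').count().show()
sc.stop()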