import logging
import time
from datetime import datetime

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

# spark_utils and spark_conf_dict are external helpers/config expected to be
# provided by the surrounding project; they are referenced here as in the original.


def create_spark_session(app_name=None):
    """Build a SparkSession, deriving parallelism settings from custom conf keys."""
    # Compute the default at call time rather than at import time, so each
    # session gets a fresh timestamp in its name (the original evaluated the
    # timestamp once, when the module was imported).
    if app_name is None:
        app_name = 'SparkApplication_{}'.format(
            datetime.utcfromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    def init_calc_props(spark_ctx_conf):
        # If both custom keys are present, derive parallelism as
        # total executor cores * factor and apply it to the standard Spark knobs.
        if {'parallelism-to-tasks-factor', 'total-executor-cores'} <= set(
                dict(spark_ctx_conf.getAll())):
            total_executor_cores = spark_utils.calc_max_cores(
                int(spark_ctx_conf.get('total-executor-cores')))
            parallelism_to_tasks_factor = int(
                spark_ctx_conf.get('parallelism-to-tasks-factor'))
            value = str(total_executor_cores * parallelism_to_tasks_factor)
            logging.info(
                'total_executor_cores: {0}, parallelism_to_tasks_factor: {1}, '
                'value: {2}'.format(
                    total_executor_cores, parallelism_to_tasks_factor, value))
            spark_ctx_conf.set('spark.default.parallelism', value)
            spark_ctx_conf.set('spark.sql.shuffle.partitions', value)
        logging.debug('Starting local spark with conf: {0}'.format(
            '\n'.join(str(v) for v in spark_ctx_conf.getAll())))

    spark_conf = SparkConf()
    spark_conf.setAll(spark_conf_dict.items())
    # The original accepted app_name but never applied it; set it on the conf.
    spark_conf.setAppName(app_name)
    init_calc_props(spark_conf)
    sc = spark_utils.generate_spark_context(spark_conf=spark_conf)
    spark = SparkSession(sc)
    spark.sparkContext.setLogLevel('WARN')
    return spark
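
# A minimal usage sketch for create_spark_session. It assumes spark_conf_dict
# and spark_utils are supplied by the surrounding project; _demo_session is a
# hypothetical name introduced only for illustration.
def _demo_session():
    spark = create_spark_session(app_name='demo-app')
    # Sanity check: build a small DataFrame and trigger an action.
    spark.range(100).selectExpr('id % 10 AS bucket').groupBy('bucket').count().show()
    spark.stop()
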
def initialize(target_partitions=None):
    """Returns SparkContext and SQLContext."""
    conf = SparkConf()
    # Kryo is faster and more compact than the default Java serializer;
    # G1GC tends to behave better for large executor heaps.
    extra_settings = {
        'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
        'spark.executor.extraJavaOptions': '-XX:+UseG1GC',
    }
    if target_partitions:
        extra_settings['spark.default.parallelism'] = target_partitions
    conf.setAll(extra_settings.items())
    environment = {'PYTHON_EGG_CACHE': '/tmp/python-eggs'}
    sc = SparkContext(conf=conf, environment=environment)
    sqlContext = SQLContext(sc)
    if target_partitions:
        sqlContext.setConf('spark.sql.shuffle.partitions', target_partitions)
    # Silence noisy Spark/Akka output via the JVM-side log4j API.
    jvm_logger = sc._jvm.org.apache.log4j
    jvm_logger.LogManager.getLogger('org').setLevel(jvm_logger.Level.ERROR)
    jvm_logger.LogManager.getLogger('akka').setLevel(jvm_logger.Level.ERROR)
    return sc, sqlContext
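
# A hedged usage sketch for initialize(); _demo_initialize and the partition
# count are illustrative assumptions, not part of the original. Note that
# target_partitions is passed as a string, matching how SparkConf stores values.
def _demo_initialize():
    sc, sqlContext = initialize(target_partitions='200')
    df = sqlContext.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
    print(df.count())
    sc.stop()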