import json

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                spark_config=None, ssc_config=None, kafka_config=None,
                callback=None):
    # Build the Spark configuration, applying caller-supplied overrides.
    spark_conf = SparkConf().setAppName(app_name).setMaster(master)
    for key, value in (spark_config or {}).items():
        spark_conf.setIfMissing(key, value)
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sparkContext=sc)
    ssc = StreamingContext(sc, ssc_config['batchDuration'])
    # Each Kafka record arrives as a (key, value) pair; msg[1] is the
    # JSON-encoded message body.
    kafka_stream = KafkaUtils.createDirectStream(
        ssc,
        topics=kafka_config['topics'],
        kafkaParams=kafka_config['config'],
    ).map(lambda msg: json.loads(msg[1]))
    if callback:
        callback(kafka_stream=kafka_stream, sc=sc, spark_conf=spark_conf)
    ssc.start()
    ssc.awaitTermination()
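A minimal usage sketch for the function above; the topic name, broker address, batch duration, and the pprint-based callback are illustrative assumptions, not part of the original snippet.

# Hypothetical consumer: print a few decoded JSON records per micro-batch.
def handle_stream(kafka_stream, sc, spark_conf):
    kafka_stream.pprint(num=5)

start_spark(
    app_name='orders_consumer',
    ssc_config={'batchDuration': 10},
    kafka_config={
        'topics': ['orders'],                                  # assumed topic
        'config': {'metadata.broker.list': 'localhost:9092'},  # assumed broker
    },
    callback=handle_stream,
)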
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext


def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                spark_config=None, ssc_config=None, callback=None):
    spark_conf = SparkConf().setAppName(app_name).setMaster(master)
    for key, value in (spark_config or {}).items():
        spark_conf.setIfMissing(key, value)
    if jar_packages:
        # Must be set before the SparkContext launches the JVM, and must end
        # with 'pyspark-shell' or spark-submit ignores the arguments.
        os.environ['PYSPARK_SUBMIT_ARGS'] = (
            '--packages ' + ','.join(jar_packages) + ' pyspark-shell')
    sc = SparkContext(conf=spark_conf)
    spark = SparkSession(sparkContext=sc)
    # create session and retrieve Spark logger object
    # spark_logger = logging.Log4j(spark)
    ssc = StreamingContext(sc, (ssc_config or {}).get('batchDuration', 5))
    if callback:
        callback(spark=spark, ssc=ssc)
    ssc.start()
    ssc.awaitTermination()
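This variant defers all stream wiring to the callback, which runs after the contexts exist but before ssc.start(). A sketch under assumed values; the package coordinates, socket source, and config key are illustrative:

# Hypothetical job body: wire DStream transformations before the context starts.
def build_job(spark, ssc):
    lines = ssc.socketTextStream('localhost', 9999)
    lines.count().pprint()

start_spark(
    app_name='stream_job',
    jar_packages=['org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.0'],  # assumed coordinates
    spark_config={'spark.executor.memory': '1g'},
    callback=build_job,
)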
import os

from pyspark import SparkConf


def getSparkConf():
    # Pull the Spark section of the pipeline configuration (PipelineUtils is
    # the surrounding project's config helper).
    config = PipelineUtils.getConfig()['spark']
    # PYSPARK_SUBMIT_ARGS must be set before the SparkContext starts its JVM;
    # the trailing 'pyspark-shell' token is required for it to take effect.
    packages = '--packages ' + ','.join(config['packages'])
    os.environ['PYSPARK_SUBMIT_ARGS'] = packages + ' pyspark-shell'
    spark_conf = SparkConf().setAppName(config['appName']).setMaster(
        config['master'])
    for key, value in config.get('conf', {}).items():
        spark_conf.setIfMissing(key, value)
    return spark_conf
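getSparkConf depends on the external PipelineUtils helper, so the shape of its 'spark' section can only be inferred from the lookups above; a sketch with assumed keys and values:

# Assumed shape of PipelineUtils.getConfig()['spark'] (illustrative values):
# {
#     'appName':  'my_pipeline',
#     'master':   'local[*]',
#     'packages': ['org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0'],
#     'conf':     {'spark.sql.shuffle.partitions': '8'},
# }
sc = SparkContext(conf=getSparkConf())
spark = SparkSession(sparkContext=sc)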
def configure(app_name="Sparkling Water Demo"): conf = SparkConf() conf.setAppName(app_name) conf.setIfMissing("spark.master", os.getenv("spark.master", "local[*]")) return conf