def get_data():
    # Register the MySQL JDBC driver with Spark
    findspark.add_packages('mysql:mysql-connector-java:8.0.11')
    spark = SparkSession.builder.appName('pipeline').getOrCreate()

    # Load credentials from a local JSON file
    with open('/home/tom/mysql_creds.json', 'r') as f:
        data = json.load(f)
    hostname = 'localhost'
    jdbcPort = 3306
    username = data['username']  # assumption: the creds file also stores a 'username' key (original used an undefined name)
    password = data['password']
    dbname = 'my_company'

    jdbc_url = "jdbc:mysql://{0}:{1}/{2}?user={3}&password={4}".format(
        hostname, jdbcPort, dbname, username, password)

    # Wrap the query in a subquery alias so Spark can treat it as a table
    query = "(select * from syria_data) t1_alias"
    df = spark.read.format('jdbc').options(driver='com.mysql.jdbc.Driver',
                                           url=jdbc_url,
                                           dbtable=query).load()
    df.write.parquet('/home/tom/Documents/csv_files/syria_parquet.parquet')
def load_data():
    # Adding JDBC driver to connect to MySQL
    findspark.add_packages('mysql:mysql-connector-java:8.0.11')
    spark = SparkSession.builder.appName('pipeline').config(
        "spark.ui.port", "4050").getOrCreate()

    hostname = "localhost"
    dbname = "dag_data"
    jdbcPort = 3306
    username = "******"
    password = SQL_PASSWORD  # assumed to be defined at module level
    jdbc_url = "jdbc:mysql://{0}:{1}/{2}?user={3}&password={4}".format(
        hostname, jdbcPort, dbname, username, password)

    df = spark.read.parquet(
        '/home/tom/Documents/csv_files/book_parquet.parquet')

    # Saving to MySQL
    df.write.format('jdbc').options(url=jdbc_url,
                                    driver='com.mysql.jdbc.Driver',
                                    dbtable='bookstore_data',
                                    user=username,
                                    password=password).mode('overwrite').save()
import findspark

try:
    from pyspark import context
except ImportError:
    # Add PySpark to the library path based on the value of SPARK_HOME if
    # pyspark is not already in our path
    findspark.init()
    findspark.add_packages(['com.databricks:spark-csv_2.10:1.4.0'])
import json
import sys, os, re, ast

import findspark

# Add the streaming package and initialize
findspark.add_packages(
    ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.0"])
findspark.init()

import pyspark
import pyspark.streaming
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def main():
    PERIOD = 10
    BROKERS = 'localhost:9092'
    TOPIC = 'twitterstream'
    duration = 100

    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName='Streamer', conf=conf)

    # Create a streaming context with batch interval 10 sec
    ssc = StreamingContext(sc, PERIOD)
    # ssc.checkpoint("checkpoint")

    stream = KafkaUtils.createDirectStream(ssc, [TOPIC], {
        "metadata.broker.list": BROKERS,
    })

    # Kafka messages arrive as (key, value) pairs; the value here is a JSON
    # string whose payload is itself JSON-encoded, hence the double json.loads
    tweets = stream.map(lambda x: json.loads(x[1])).map(
        lambda x: json.loads(x))
    text = tweets.map(lambda x: x['text'])
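    # The snippet above ends before the stream is consumed. A minimal, hedged
    # completion: print the extracted tweet text and run the streaming context.
    # Reusing the otherwise-unused `duration` as a timeout is an assumption.
    text.pprint()

    ssc.start()
    ssc.awaitTermination(duration)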
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)

    spark = pyspark.sql.SparkSession(sc).builder.appName(
        APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in [
                "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
                "Dest", "Route"
        ]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
            "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
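# The script above defines main(base_path) but never invokes it. A
# conventional entry point might look like the following; taking the base
# path from the first command-line argument (and sys being imported at the
# top of the full script) are assumptions:
if __name__ == "__main__":
    main(sys.argv[1])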
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

from analyze.datefrme_mongo import Trade

SPARK_HOME = '/Users/luodongshen/Documents/soft/spark-3.0.0-bin-hadoop3.2'

import findspark

findspark.add_packages('org.mongodb.spark:mongo-spark-connector_2.12:3.0.0')
findspark.init(SPARK_HOME)

from pyspark.sql import SparkSession
import datetime
import logging

import base
import constant
from analyze.GrahamTendency import GrahamPeTTM, CHINA_AAA
import orm.mongobase as om

spark = SparkSession.builder.appName('MyApp') \
    .config('spark.mongodb.input.uri', 'mongodb://127.0.0.1/stock.k_data') \
    .getOrCreate()

schema = StructType([
    StructField("code", StringType()),
    StructField("start_date", StringType()),
    StructField("end_date", StringType())
])
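# With spark.mongodb.input.uri configured above, the collection can be read
# straight into a DataFrame. A minimal sketch, assuming the Mongo Spark
# connector 3.0.0 registered above ("mongo" is the connector's documented
# short format name); the variable name is a placeholder:
k_data_df = spark.read.format("mongo").load()
k_data_df.printSchema()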
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)

    spark = pyspark.sql.SparkSession(sc).builder.appName(
        APP_NAME).getOrCreate()

    #
    # Load all models used in making predictions
    #

    # Load the arrival delay bucketizer model
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process prediction requests in streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the pipeline corresponding to each column
        # Turn category fields into categorical feature vectors, then drop the intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance, and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give back the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to MongoDB
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to MongoDB
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import mysql_connect_class
from generator.server_log_generator import ServerLogGenerator

import findspark

# Use findspark in case the mysql package is not found
findspark.add_packages('mysql:mysql-connector-java:8.0.11')

# Set Kafka config
kafka_broker = "b-2.log.02msna.c8.kafka.us-west-2.amazonaws.com:9092," \
               "b-1.log.02msna.c8.kafka.us-west-2.amazonaws.com:9092"
kafka_topic_input = "server-logs"

# MySQL connection parameters
mysql_host = 'stream-database.cp2rkjojtqyn.us-west-2.rds.amazonaws.com'
mysql_port = '3306'


def get_defined_values():
    # Create a ServerLogGenerator instance to get predefined values
    s = ServerLogGenerator()
    countries = s._location_country  # Pre-defined countries
    event_types = s._event_type  # Pre-defined events
    devices = ["ANDROID", "IOS"]  # Pre-defined devices
    return [countries, event_types, devices]
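# The Kafka config above is set up but never consumed in this snippet. A
# hedged sketch of how the broker and topic might be read with Structured
# Streaming; this assumes the org.apache.spark:spark-sql-kafka-0-10 package
# is also registered, which the snippet above does not do:
spark = SparkSession.builder.appName("server-log-stream").getOrCreate()
raw_logs = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic_input) \
    .load()
# Kafka values arrive as bytes; cast to string before parsing
logs = raw_logs.selectExpr("CAST(value AS STRING) AS value")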
SPARK_HOME = '/Users/luodongshen/Documents/soft/spark-3.0.0-bin-hadoop3.2'

import findspark

# NB: Spark 3.0 is built against Scala 2.12, so a _2.12 connector artifact
# would normally be required here rather than _2.11
findspark.add_packages('org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
findspark.init(SPARK_HOME)

from pyspark.sql import SparkSession

logFile = "/Users/luodongshen/Documents/stock_logs/stock_info.log"  # Should be some file on your system
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
logData = spark.read.text(logFile).cache()

numAs = logData.filter(logData.value.contains('a')).count()
numBs = logData.filter(logData.value.contains('b')).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

spark.stop()
import os

os.environ["PYSPARK_SUBMIT_ARGS"] = ""

import findspark

findspark.init()

# To install, start a shell with the connector on the classpath:
#   $SPARK_HOME/bin/pyspark --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.0
# To load the mongo connector package via findspark instead:
findspark.add_packages(["org.mongodb.spark:mongo-spark-connector_2.11:2.2.0"])

from pyspark.sql import SparkSession

FORMAT = "com.mongodb.spark.sql.DefaultSource"
URI = "mongodb://127.0.0.1:27017/{db}.{col}"


def get_session(database, collection):
    uri = URI.format(db=database, col=collection)
    return SparkSession \
        .builder \
        .appName("myApp") \
        .config("spark.mongodb.input.uri", uri) \
        .config("spark.mongodb.output.uri", uri) \
        .getOrCreate()
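# A short usage sketch for get_session, assuming a local MongoDB; the
# "mydb"/"users" database and collection names are placeholders:
spark = get_session("mydb", "users")
users_df = spark.read.format(FORMAT).load()  # reads from spark.mongodb.input.uri
users_df.show()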
import json
import ast
import os

from scipy.spatial.distance import euclidean

# from kafka import KafkaConsumer
#
# consumer1 = KafkaConsumer('test_5', bootstrap_servers=['172.17.0.1:9092'])
#
# for message in consumer1:
#     print message

import findspark

findspark.init('/home/oliver/Documents/spark-2.0.0-bin-hadoop2.7')
findspark.add_packages(
    ['org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.0-preview'])

from pyspark import SparkContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext

import pandas as pd

offsetRanges = []

wifi_hotspots = [{
    'name': '2WIRE413',
    'mac_add': '28:16:2e:a4:c4:41',
    'lat_lng': (333, 333)
}, {
    'name': 'ATTUuVi3A2',
    'mac_add': '78:96:84:6e:6f:a0',
    'lat_lng': (333, 666)
}]  # list truncated in the source; further hotspot entries may follow
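# The snippet is setup only. A hedged sketch of the Kafka direct stream it
# appears to be building toward, reusing the topic and broker from the
# commented-out KafkaConsumer code above (batch interval is an assumption):
sc = SparkContext(appName="wifi-stream")
ssc = StreamingContext(sc, 10)
stream = KafkaUtils.createDirectStream(
    ssc, ["test_5"], {"metadata.broker.list": "172.17.0.1:9092"})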
"""Script takes data from Redis and loads it into Spark, where JDBC is used
to move data to a MySQL instance hosted on EC2"""

import findspark
import json
import redis
import os

from pyspark.sql import SparkSession

# Adding MySQL driver to Spark
findspark.add_packages("mysql:mysql-connector-java:8.0.11")

# Retrieving data from Redis ---
r = redis.Redis(host="redis", port=6379)
data = json.loads(r.get("wine_data").decode("utf8"))

# Reading data into Spark ---
spark = SparkSession.builder.appName("pipeline").getOrCreate()
with open("/usr/local/airflow/dags/ETL/schema.txt", "r") as f:
    SCHEMA = f.read()
df = spark.createDataFrame(data, schema=SCHEMA)

# Writing data to MySQL ---
user = os.environ.get("MYSQL_USER")
password = os.environ.get("MYSQL_PASSWORD")
db = os.environ.get("MYSQL_DB")
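# The snippet stops after reading the connection settings. A hedged sketch of
# the JDBC write it is building toward; the MYSQL_HOST environment variable
# and the "wine_data" target table are assumptions, not from the source:
jdbc_url = "jdbc:mysql://{0}:{1}/{2}".format(
    os.environ.get("MYSQL_HOST", "localhost"), 3306, db)
df.write.format("jdbc").options(
    url=jdbc_url,
    driver="com.mysql.jdbc.Driver",
    dbtable="wine_data",  # hypothetical target table
    user=user,
    password=password).mode("overwrite").save()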
args = arg_parse()

if args.mode == "local":
    import findspark

    # Linux/WSL alternative:
    # os.environ["JAVA_HOME"] = r"/usr/lib/jvm/java-1.8.0-openjdk-amd64"
    # os.environ["SPARK_HOME"] = r"/mnt/c/projects/spark2.4.5"
    # os.environ['PYSPARK_SUBMIT_ARGS'] = ""
    # findspark.init(r"/mnt/c/projects/spark2.4.5")
    # findspark.add_packages(["org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.5"])

    os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk1.8.0_241"
    os.environ["SPARK_HOME"] = r"C:\spark-2.4.5-bin-hadoop2.7"
    os.environ['PYSPARK_SUBMIT_ARGS'] = ""
    findspark.init(r"C:\spark-2.4.5-bin-hadoop2.7")
    findspark.add_packages(
        ["org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.5"])

from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession, DataFrame
from pyspark import RDD
from pyspark import SparkConf, SparkContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
from pyspark.sql.context import SQLContext
from pyspark.sql.types import StructType

schema = StructType.fromJson(app_config["all_data_scheme"])

connect = create_db_connection(args.redshift_host, args.redshift_port,
                               args.redshift_user, args.redshift_password,
                               args.redshift_db_name)