def __init__(self, config: SparkConfiguration):
    self.config = config
    self.database = config.get_config(Constants.POSTGRESQL_DB)
    self.host = config.get_config(Constants.POSTGRESQL_HOST)
    self.username = config.get_config(Constants.POSTGRESQL_USER)
    self.password = config.get_config(Constants.POSTGRESQL_PASSWORD)
    self.jdbc = "jdbc:postgresql://{0}/{1}?user={2}&password={3}".format(
        self.host, self.database, self.username, self.password)
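# A hedged sketch of how PostgreSQLConnector.get_table and .store might wrap
# Spark's JDBC reader and writer around the URL built in __init__ above; the
# method names come from their call sites elsewhere in the project, the
# bodies are assumptions.
def get_table(self, table_name):
    # Read a full PostgreSQL table into a DataFrame over JDBC.
    return (self.config.spark_session.read
            .format("jdbc")
            .option("url", self.jdbc)
            .option("dbtable", table_name)
            .option("driver", "org.postgresql.Driver")
            .load())

def store(self, df, table_name):
    # Append a DataFrame to a PostgreSQL table over JDBC.
    (df.write
       .format("jdbc")
       .option("url", self.jdbc)
       .option("dbtable", table_name)
       .option("driver", "org.postgresql.Driver")
       .mode("append")
       .save())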
def main():
    # Configure Spark Session
    config = {
        "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0,"
                               "org.postgresql:postgresql:9.4.1211,"
                               "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,"
                               "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.driver.memory": "8g",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME,
    }
    spark_configuration = SparkConfiguration(
        app_name="visits_ads_event_ingestion",
        spark_master="local[4]",
        log_level="WARN",
        configuration=config)

    import main.orchestrator as Orchestrator

    ########################
    # Visit events ingestion
    ########################
    visits_schema = StructType([
        StructField('id_user', IntegerType(), False),
        StructField('id_video', IntegerType(), False),
        StructField('id_device', IntegerType(), False),
        StructField('id_location', IntegerType(), False),
        StructField('visit_date', TimestampType(), True)
    ])

    visits_stream = KafkaConnector(spark_configuration).get_stream(
        'visits', start_from_begining=False).load()
    visits_stream = extract_json_data(visits_stream, visits_schema)

    # For each micro-batch of visit events
    visits_stream.writeStream \
        .option("checkpointLocation", "checkpoint/visits") \
        .foreachBatch(lambda visits_batch, index:
                      Orchestrator.ingest_visits(visits_batch,
                                                 spark_configuration,
                                                 index)) \
        .start()

    # Await stream termination
    spark_configuration.spark_session.streams.awaitAnyTermination()
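# A minimal sketch of what extract_json_data could look like, assuming each
# Kafka record's value column carries one JSON document per visit event.
# from_json and the column handling are standard Spark; the helper itself is
# an assumption based on its call above.
from pyspark.sql.functions import col, from_json

def extract_json_data(stream_df, schema):
    # Decode the Kafka value bytes and expand the JSON payload into columns.
    return (stream_df
            .select(from_json(col("value").cast("string"), schema).alias("data"))
            .select("data.*"))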
def main():
    # Configure Spark Session
    config = {
        "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0,"
                               "org.postgresql:postgresql:9.4.1211,"
                               "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,"
                               "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.driver.memory": "8g",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME,
    }
    spark_configuration = SparkConfiguration(
        app_name="visits_video_processor",
        spark_master="local[2]",
        log_level="WARN",
        configuration=config)

    from main.connectors.delta_connector import DeltaConnector
    import main.orchestrator as Orchestrator

    videos_ref_df = PostgreSQLConnector(spark_configuration).get_table(
        Constants.VIDEOS_TABLE).cache()
    visits_video = DeltaConnector(spark_configuration).get_stream(
        Constants.VISITSXVIDEO_TABLE)

    # For each micro-batch of visit events
    visits_video.writeStream \
        .option("checkpointLocation", "checkpoint/visits_video") \
        .foreachBatch(lambda visits_video_batch, index:
                      Orchestrator.ingest_video_visits(visits_video_batch,
                                                       spark_configuration,
                                                       videos_ref_df,
                                                       index)) \
        .trigger(processingTime='30 seconds') \
        .start()

    # Await stream termination
    spark_configuration.spark_session.streams.awaitAnyTermination()
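# A hedged sketch of how DeltaConnector.get_stream might expose a Delta table
# as a streaming DataFrame; the path layout built from DELTA_SRC_PATH plus the
# table name is an assumption based on the configuration keys used above.
def get_stream(self, table_name):
    delta_path = "{0}/{1}".format(
        self.config.get_config(Constants.DELTA_SRC_PATH), table_name)
    # Stream new rows appended to the Delta table at that location.
    return (self.config.spark_session.readStream
            .format("delta")
            .load(delta_path))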
def main():
    config = {
        "spark.jars.packages": "org.postgresql:postgresql:9.4.1211",
        Constants.CURRENT_DATA_DELTA_TABLE_NAME: Constants.CURRENT_DATA,
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME
    }
    spark_configuration = SparkConfiguration(
        app_name="reference_data_generation",
        spark_master="local[*]",
        log_level="INFO",
        configuration=config)

    visitors_data = random_generator.generate_random_users(10**6)
    rdd_visitors = spark_configuration.spark_session.sparkContext.parallelize([
        (us["username"], us["email"], us["birth_date"], us["gender"],
         us["phone_number"], us["id_country"])
        for us in visitors_data
    ])
    schema_visitor = StructType([
        StructField('username', StringType(), False),
        StructField('email', StringType(), False),
        StructField('birth_date', TimestampType(), False),
        StructField('gender', StringType(), False),
        StructField('phone_number', StringType(), True),
        StructField('id_country', IntegerType(), False)
    ])
    # Create data frame
    visitors_df = spark_configuration.spark_session.createDataFrame(
        rdd_visitors, schema_visitor)
    PostgreSQLConnector(spark_configuration).store(visitors_df, "visitors")

    videos_data = random_generator.generate_random_videos(10**5)
    videos_rdd = spark_configuration.spark_session.sparkContext.parallelize([
        (vid["id_user"], vid["title"], vid["id_language"], vid["category"])
        for vid in videos_data
    ])
    schema_videos = StructType([
        StructField('id_user', IntegerType(), False),
        StructField('title', StringType(), False),
        StructField('id_language', IntegerType(), False),
        StructField('category', StringType(), False),
    ])
    # Create data frame
    videos_df = spark_configuration.spark_session.createDataFrame(
        videos_rdd, schema_videos)
    PostgreSQLConnector(spark_configuration).store(videos_df, "video")
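# A hedged sketch of what random_generator.generate_random_users might
# return, based only on the dictionary keys consumed above; the value ranges
# and formats are illustrative assumptions.
import datetime
import random

def generate_random_users(count):
    return [{
        "username": "user_{0}".format(i),
        "email": "user_{0}@example.com".format(i),
        "birth_date": datetime.datetime(1970, 1, 1)
                      + datetime.timedelta(days=random.randint(0, 18000)),
        "gender": random.choice(["F", "M"]),
        "phone_number": str(random.randint(10**8, 10**9 - 1)),
        "id_country": random.randint(1, 200),
    } for i in range(count)]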
def main():
    # Configure Spark Session
    config = {
        "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0,"
                               "org.postgresql:postgresql:9.4.1211,"
                               "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1",
        "spark.driver.memory": "8g",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        Constants.CURRENT_DATA_DELTA_TABLE_NAME: Constants.CURRENT_DATA,
        Constants.DELTA_SRC: Constants.DELTA_LOCATION,
        # Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        # Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        # Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
    }
    spark_configuration = SparkConfiguration(
        app_name="New Data Ingest",
        spark_master="local[*]",
        log_level="INFO",
        configuration=config)

    import main.connectors.mock_database_connector as DatabaseConnector
    import main.orchestrator as Orchestrator

    # EXTRACT NEW DATA
    new_data = DatabaseConnector.get_new_data(
        spark_configuration=spark_configuration)
    # new_data = PostgreSQLConnector(spark_configuration).get_table('some_table')

    # INGEST NEW DATA
    Orchestrator.ingest(new_data=new_data,
                        spark_configuration=spark_configuration)
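# A purely illustrative sketch of what the mock connector's get_new_data
# could do: build a small in-memory DataFrame standing in for a real extract.
# The column names below are hypothetical and not taken from the project.
def get_new_data(spark_configuration):
    rows = [(1, "first sample row"), (2, "second sample row")]
    return spark_configuration.spark_session.createDataFrame(
        rows, ["id", "payload"])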
import datetime
import json
import time

from kafka import KafkaProducer

# The generated events are published to the topic consumed by the ingestion
# job (assumed to be 'visits').
KAFKA_TOPIC = "visits"


def main():
    config = {
        "spark.jars.packages": "org.postgresql:postgresql:9.4.1211",
        "spark.driver.memory": "8g",
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME
    }
    spark_configuration = SparkConfiguration(
        app_name="visits_events_generator",
        spark_master="local[2]",
        log_level="INFO",
        configuration=config)

    # Load the reference tables the generator samples from.
    postgres_driver = PostgreSQLConnector(spark_configuration)
    videos_df = postgres_driver.get_table('video')
    users_df = postgres_driver.get_table('visitors')
    devices_df = postgres_driver.get_table('device')
    locations_df = postgres_driver.get_table('locations')

    visits_generator = VisitGenerator(videos_df, users_df,
                                      devices_df, locations_df)
    # The generator presumably keeps what it needs in memory, so Spark can be
    # stopped before the produce loop.
    spark_configuration.spark_session.stop()

    # Producer for the generated events (kafka-python); the broker address is
    # assumed to be the one configured above.
    kafka_producer = KafkaProducer(
        bootstrap_servers=Constants.KAFKA_SERVER_NAME)

    while True:
        sleep_time = input("Sleep ms (default 10ms):")
        sleep_time = int(sleep_time) if sleep_time != '' else 10
        generated_visits = visits_generator.generate_visits()
        loops = input("Number of loops (default 1):")
        loops = int(loops) if loops != '' else 1
        for _ in range(loops):
            for i in range(len(generated_visits)):
                # Stamp each event with the current time and publish it as JSON.
                generated_visits[i]["visit_date"] = datetime.datetime.now()
                value_to_send = json.dumps(generated_visits[i],
                                           default=str).encode('utf-8')
                print(value_to_send)
                kafka_producer.send(KAFKA_TOPIC, value=value_to_send)
                time.sleep(sleep_time / 1000)
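# A hedged sketch of how VisitGenerator might work, inferred from the fields
# published above (id_user, id_video, id_device, id_location) and from the
# fact that events are still generated after the Spark session is stopped:
# the identifier columns are collected once into plain Python lists. The
# column names and the sampling strategy are assumptions.
import random

class VisitGenerator:
    def __init__(self, videos_df, users_df, devices_df, locations_df):
        # Collect only the id columns so no Spark access is needed later
        # (column names are assumed).
        self.video_ids = [r[0] for r in videos_df.select("id_video").collect()]
        self.user_ids = [r[0] for r in users_df.select("id_user").collect()]
        self.device_ids = [r[0] for r in devices_df.select("id_device").collect()]
        self.location_ids = [r[0] for r in locations_df.select("id_location").collect()]

    def generate_visits(self, count=100):
        # Produce visit events without a visit_date; the caller stamps it.
        return [{
            "id_user": random.choice(self.user_ids),
            "id_video": random.choice(self.video_ids),
            "id_device": random.choice(self.device_ids),
            "id_location": random.choice(self.location_ids),
        } for _ in range(count)]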
def spark():
    return SparkConfiguration(app_name="Tests")
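# A minimal example of how the spark helper above could be used from a test,
# assuming it is exposed as a pytest fixture; the assertion relies only on
# the spark_session attribute used throughout the project.
def test_spark_session_is_created(spark):
    assert spark.spark_session is not None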
def __init__(self, config: SparkConfiguration):
    self.config = config
    self.kafka_server = config.get_config(Constants.KAFKA_SERVER)
    if not self.kafka_server:
        raise ConfigNotFoundError()
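# A minimal sketch of how KafkaConnector.get_stream might build the streaming
# source used by the ingestion job; the method name and the
# 'start_from_begining' parameter come from the call site, the body is an
# assumption based on the standard Spark structured-streaming Kafka source.
def get_stream(self, topic, start_from_begining=False):
    # Return the configured reader; callers invoke .load() on it themselves.
    return (self.config.spark_session.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", self.kafka_server)
            .option("subscribe", topic)
            .option("startingOffsets",
                    "earliest" if start_from_begining else "latest"))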