Example No. 1
def __init__(self, config: SparkConfiguration):
    self.config = config
    # Pull the PostgreSQL connection settings from the shared SparkConfiguration
    self.database = config.get_config(Constants.POSTGRESQL_DB)
    self.host = config.get_config(Constants.POSTGRESQL_HOST)
    self.username = config.get_config(Constants.POSTGRESQL_USER)
    self.password = config.get_config(Constants.POSTGRESQL_PASSWORD)
    # Assemble the JDBC URL used for every read/write against the database
    self.jdbc = "jdbc:postgresql://{0}/{1}?user={2}&password={3}".format(
        self.host, self.database, self.username, self.password)
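The constructor above only assembles the JDBC URL; the read path is not part of the snippet. Later examples call get_table on this connector (Examples No. 3 and No. 6), so here is a minimal sketch of what that method might look like on top of Spark's built-in JDBC source; the implementation details are an assumption, not the project's code.

from pyspark.sql import DataFrame

def get_table(self, table_name: str) -> DataFrame:
    # Read a whole PostgreSQL table through Spark's JDBC data source,
    # reusing the URL assembled in __init__ (sketch, not the original code).
    return (self.config.spark_session.read
            .format("jdbc")
            .option("url", self.jdbc)
            .option("dbtable", table_name)
            .option("driver", "org.postgresql.Driver")
            .load())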
Example No. 2
def main():
    # Configure Spark Session
    config = {
        "spark.jars.packages": ("io.delta:delta-core_2.12:0.8.0,"
                                "org.postgresql:postgresql:9.4.1211,"
                                "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,"
                                "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0"),
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.driver.memory": "8g",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME,
    }
    spark_configuration = SparkConfiguration(
        app_name="visits_ads_event_ingestion",
        spark_master="local[4]",
        log_level="WARN",
        configuration=config)
    import main.orchestrator as Orchestrator

    ########################
    # Visit events ingestion
    ########################

    visits_schema = StructType([
        StructField('id_user', IntegerType(), False),
        StructField('id_video', IntegerType(), False),
        StructField('id_device', IntegerType(), False),
        StructField('id_location', IntegerType(), False),
        StructField('visit_date', TimestampType(), True)
    ])
    visits_stream = KafkaConnector(spark_configuration).get_stream(
        'visits', start_from_begining=False).load()
    visits_stream = extract_json_data(visits_stream, visits_schema)

    # For each micro-batch of visit events
    visits_stream.writeStream \
        .option("checkpointLocation", "checkpoint/visits") \
        .foreachBatch(lambda visits_batch, index: Orchestrator.ingest_visits(visits_batch, spark_configuration, index))\
        .start()

    # Await stream termination
    spark_configuration.spark_session.streams.awaitAnyTermination()
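extract_json_data is called above but not included in the snippet. A minimal sketch of a plausible implementation, assuming the Kafka value column carries JSON-encoded events matching visits_schema; this body is an assumption, not the original helper.

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType

def extract_json_data(stream_df: DataFrame, schema: StructType) -> DataFrame:
    # Kafka exposes the payload as binary in the `value` column: cast it to a string,
    # parse it with the expected schema, then flatten the parsed struct into columns.
    return (stream_df
            .select(from_json(col("value").cast("string"), schema).alias("event"))
            .select("event.*"))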
Example No. 3
def main():
    # Configure Spark Session
    config = {
        "spark.jars.packages": ("io.delta:delta-core_2.12:0.8.0,"
                                "org.postgresql:postgresql:9.4.1211,"
                                "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,"
                                "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0"),
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.driver.memory": "8g",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME,
    }
    spark_configuration = SparkConfiguration(app_name="visits_video_processor",
                                             spark_master="local[2]",
                                             log_level="WARN",
                                             configuration=config)
    from main.connectors.delta_connector import DeltaConnector
    import main.orchestrator as Orchestrator

    videos_ref_df = PostgreSQLConnector(spark_configuration).get_table(
        Constants.VIDEOS_TABLE).cache()
    visits_video = DeltaConnector(spark_configuration).get_stream(
        Constants.VISITSXVIDEO_TABLE)

    # For each micro-batch of visit-video records
    visits_video.writeStream \
        .option("checkpointLocation", "checkpoint/visits_video") \
        .foreachBatch(lambda visits_video_batch, index: Orchestrator.ingest_video_visits(visits_video_batch,
                                                                                         spark_configuration,
                                                                                         videos_ref_df,
                                                                                         index
                                                                                         )) \
        .trigger(processingTime='30 seconds') \
        .start()

    # Await stream termination
    spark_configuration.spark_session.streams.awaitAnyTermination()
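Orchestrator.ingest_video_visits is not shown either. As a rough sketch only, a foreachBatch handler of this shape typically enriches the micro-batch with the cached reference table and appends the result to Delta; the join key, output path, and write mode below are illustrative assumptions.

def ingest_video_visits(visits_video_batch, spark_configuration, videos_ref_df, batch_id):
    # Enrich the micro-batch with the cached video reference data (assumed join key),
    # then append it to an assumed Delta output location.
    enriched = visits_video_batch.join(videos_ref_df, on="id_video", how="left")
    (enriched.write
        .format("delta")
        .mode("append")
        .save(spark_configuration.get_config(Constants.DELTA_SRC_PATH) + "/visits_video_enriched"))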
Example No. 4
def main():
    config = {
        "spark.jars.packages": "org.postgresql:postgresql:9.4.1211",
        Constants.CURRENT_DATA_DELTA_TABLE_NAME: Constants.CURRENT_DATA,
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME
    }
    spark_configuration = SparkConfiguration(
        app_name="reference_data_generation",
        spark_master="local[*]",
        log_level="INFO",
        configuration=config)

    visitors_data = random_generator.generate_random_users(10**6)
    rdd_visitors = spark_configuration.spark_session.sparkContext.parallelize([
        (us["username"], us["email"], us["birth_date"], us["gender"],
         us["phone_number"], us["id_country"]) for us in visitors_data
    ])
    schema_visitor = StructType([
        StructField('username', StringType(), False),
        StructField('email', StringType(), False),
        StructField('birth_date', TimestampType(), False),
        StructField('gender', StringType(), False),
        StructField('phone_number', StringType(), True),
        StructField('id_country', IntegerType(), False)
    ])
    # Create data frame
    visitors_df = spark_configuration.spark_session.createDataFrame(
        rdd_visitors, schema_visitor)
    PostgreSQLConnector(spark_configuration).store(visitors_df, "visitors")

    videos_data = random_generator.generate_random_videos(10**5)
    videos_rdd = spark_configuration.spark_session.sparkContext.parallelize([
        (vid["id_user"], vid["title"], vid["id_language"], vid["category"])
        for vid in videos_data
    ])
    schema_videos = StructType([
        StructField('id_user', IntegerType(), False),
        StructField('title', StringType(), False),
        StructField('id_language', IntegerType(), False),
        StructField('category', StringType(), False),
    ])
    # Create data frame
    videos_df = spark_configuration.spark_session.createDataFrame(
        videos_rdd, schema_videos)
    PostgreSQLConnector(spark_configuration).store(videos_df, "video")
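random_generator.generate_random_users (and its generate_random_videos counterpart) are not part of the snippet. A self-contained sketch of a generator producing dictionaries with the fields consumed above; all concrete values are illustrative.

import random
from datetime import datetime, timedelta

def generate_random_users(count):
    # Build `count` user dictionaries with the keys unpacked into rdd_visitors above.
    users = []
    for i in range(count):
        users.append({
            "username": f"user_{i}",
            "email": f"user_{i}@example.com",
            "birth_date": datetime(1970, 1, 1) + timedelta(days=random.randint(0, 18000)),
            "gender": random.choice(["F", "M"]),
            "phone_number": f"+{random.randint(10000000000, 99999999999)}",
            "id_country": random.randint(1, 100),
        })
    return users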
Example No. 5
def main():
    # Configure Spark Session
    config = {
        "spark.jars.packages": ("io.delta:delta-core_2.12:0.8.0,"
                                "org.postgresql:postgresql:9.4.1211,"
                                "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1"),
        "spark.driver.memory": "8g",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        Constants.CURRENT_DATA_DELTA_TABLE_NAME: Constants.CURRENT_DATA,
        Constants.DELTA_SRC: Constants.DELTA_LOCATION,
        # Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        # Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        # Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
    }
    spark_configuration = SparkConfiguration(app_name="New Data Ingest",
                                             spark_master="local[*]",
                                             log_level="INFO",
                                             configuration=config)

    import main.connectors.mock_database_connector as DatabaseConnector
    import main.orchestrator as Orchestrator

    # EXTRACT NEW DATA
    new_data = DatabaseConnector.get_new_data(
        spark_configuration=spark_configuration)
    # new_data = PostgreSQLConnector(spark_configuration).get_table('some_table')

    # INGEST NEW DATA
    Orchestrator.ingest(new_data=new_data,
                        spark_configuration=spark_configuration)
Example No. 6
def main():
    config = {
        "spark.jars.packages": "org.postgresql:postgresql:9.4.1211",
        "spark.driver.memory": "8g",
        Constants.DELTA_SRC_PATH: Constants.DELTA_LOCATION,
        Constants.POSTGRESQL_DB: Constants.POSTGRESQL_DB_VALUE,
        Constants.POSTGRESQL_USER: Constants.POSTGRESQL_USER_VALUE,
        Constants.POSTGRESQL_PASSWORD: Constants.POSTGRESQL_PASSWORD_VALUE,
        Constants.POSTGRESQL_HOST: Constants.POSTGRESQL_HOST_VALUE,
        Constants.KAFKA_SERVER: Constants.KAFKA_SERVER_NAME
    }
    spark_configuration = SparkConfiguration(app_name="visits_events_generator", spark_master="local[2]",
                                             log_level="INFO", configuration=config)
    postgres_driver = PostgreSQLConnector(spark_configuration)

    videos_df = postgres_driver.get_table('video')
    users_df = postgres_driver.get_table('visitors')
    devices_df = postgres_driver.get_table('device')
    locations_df = postgres_driver.get_table('locations')

    visits_generator = VisitGenerator(videos_df, users_df, devices_df, locations_df)

    spark_configuration.spark_session.stop()
    while True:
        sleep_time = input("Sleep ms (default 10ms):")
        sleep_time = int(sleep_time) if sleep_time != '' else 10
        generated_visits = visits_generator.generate_visits()
        loops = input("Number of loops (default 1):")
        loops = int(loops) if loops != '' else 1
        for _ in range(loops):
            for i in range(len(generated_visits)):
                generated_visits[i]["visit_date"] = datetime.datetime.now()
                value_to_send = json.dumps(generated_visits[i], default=str).encode('utf-8')
                print(value_to_send)
                kafka_producer.send(KAFKA_TOPIC, value=value_to_send)
                time.sleep(sleep_time / 1000)
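kafka_producer and KAFKA_TOPIC are used above but never defined in the snippet. A minimal setup sketch using the kafka-python client; the topic name and the use of Constants.KAFKA_SERVER_NAME as the bootstrap address are assumptions.

from kafka import KafkaProducer

KAFKA_TOPIC = "visits"  # assumed topic, matching the stream subscribed to in Example No. 2
kafka_producer = KafkaProducer(
    bootstrap_servers=Constants.KAFKA_SERVER_NAME)  # assumed to be a host:port string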
Example No. 7
def spark():
    return SparkConfiguration(app_name="Tests")
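This helper reads like a test fixture; a sketch of how it could be registered and consumed with pytest (the decorator and the test below are assumptions, not part of the snippet):

import pytest

@pytest.fixture
def spark():
    return SparkConfiguration(app_name="Tests")

def test_spark_session_is_available(spark):
    # The configuration object is expected to expose a live SparkSession.
    assert spark.spark_session is not None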
Example No. 8
def __init__(self, config: SparkConfiguration):
    self.config = config
    # Fail fast if no Kafka bootstrap server is present in the configuration
    self.kafka_server = config.get_config(Constants.KAFKA_SERVER)
    if not self.kafka_server:
        raise ConfigNotFoundError()
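Example No. 2 calls get_stream('visits', start_from_begining=False) on this connector, but that method is not shown. A minimal sketch on top of this constructor, assuming Spark's Kafka source; the mapping of the flag onto startingOffsets is an assumption.

def get_stream(self, topic: str, start_from_begining: bool = False):
    # Build (but do not load) a streaming reader against the configured Kafka server;
    # the caller chains .load() on the returned reader, as in Example No. 2.
    offsets = "earliest" if start_from_begining else "latest"
    return (self.config.spark_session.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", self.kafka_server)
            .option("subscribe", topic)
            .option("startingOffsets", offsets))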