Example #1
def main():
    """main
    """
    spark = SparkSession \
        .builder \
        .appName("ExtractEventsJob") \
        .enableHiveSupport() \
        .getOrCreate()

    raw_events = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:29092") \
        .option("subscribe", "events") \
        .load()

    sword_purchases = raw_events \
        .filter(is_sword_purchase(raw_events.value.cast('string'))) \
        .select(raw_events.value.cast('string').alias('raw_event'),
                raw_events.timestamp.cast('string'),
                from_json(raw_events.value.cast('string'),
                          purchase_sword_event_schema()).alias('json')) \
        .select('raw_event', 'timestamp', 'json.*')

    spark.sql("drop table if exists sword_purchases")
    sql_string = """
        create external table if not exists sword_purchases (
            raw_event string,
            timestamp string,
            Accept string,
            Host string,
            `User-Agent` string,
            event_type string
            )
            stored as parquet
            location '/tmp/sword_purchases'
            tblproperties ("parquet.compress"="SNAPPY")
            """
    spark.sql(sql_string)

    sink = sword_purchases \
        .writeStream \
        .format("parquet") \
        .option("checkpointLocation", "/tmp/checkpoints_for_sword_purchases") \
        .option("path", "/tmp/sword_purchases") \
        .trigger(processingTime="10 seconds") \
        .start()

    sink.awaitTermination()
Example #2
def run_spark_job(spark):
    spark.sparkContext.setLogLevel("WARN")
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka01-vn00c1.vn.infra:9092,kafka02-vn00c1.vn.infra:9092,kafka03-vn00c1.vn.infra:9092") \
        .option("subscribe", "streaming.itbi.topic_test_02") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition", 10) \
        .load()

    df.printSchema()

    schema = StructType([
        StructField("crime_id", StringType(), True),
        StructField("original_crime_type_name", StringType(), True),
        StructField("report_date", StringType(), True),
        StructField("call_date", StringType(), True),
        StructField("offense_date", StringType(), True),
        StructField("call_time", StringType(), True),
        StructField("call_date_time", StringType(), True),
        StructField("disposition", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("agency_id", StringType(), True),
        StructField("address_type", StringType(), True),
        StructField("common_location", StringType(), True)
    ])

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("DF")) \
        .select("DF.*")

    # TODO select original_crime_type_name and disposition
    # distinct_table = (service_table
    #                   .select("original_crime_type_name", "disposition")
    #                   .distinct()
    #                  )

    query = service_table.writeStream.trigger(
        processingTime="10 seconds").format("console").option(
            "truncate", "false").start()
    query.awaitTermination()
def run_spark_job(spark):

    # Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # Get different types of original_crime_type_name in 60 minutes interval
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    # Write output stream
    query = counts_df \
        .writeStream \
        .outputMode('Complete') \
        .format('console') \
        .start()

    # Attach a ProgressReporter
    query.awaitTermination()
Example #4
def clean_MES(df):
    # Decode the base64-encoded message body into a UTF-8 string.
    df_decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8'))

    body_schema = StructType([
        StructField("dataItemType", StringType(), True),
        StructField("assetId", StringType(), True),
        StructField("value", StringType(), True)
    ])
    auth_method_schema = StructType([
        StructField("scope", StringType(), True),
        StructField("type", StringType(), True),
        StructField("issuer", StringType(), True),
        StructField("acceptingIpFilterRule", StringType(), True)
    ])
    body_value_schema = StructType([
        StructField("eventId", StringType(), True),
        StructField("assetId", StringType(), True),
        StructField("telemetryValue", StringType(), True),
        StructField("description", StringType(), True),
        StructField("dateTime", StringType(), True),
        StructField("componentName", StringType(), True),
        StructField("status", StringType(), True)
    ])

    # Parse the body JSON, flatten once, then parse the nested JSON columns
    # exposed by the first flatten, and flatten the result again.
    flattened = flatten_df(
        df_decoded.withColumn('Body', from_json(col('Body'), body_schema))) \
        .drop(col('SystemProperties')) \
        .withColumn('SystemProperties_connectionAuthMethod',
                    from_json(col('SystemProperties_connectionAuthMethod'),
                              auth_method_schema)) \
        .withColumn('Body_Value',
                    from_json(col('Body_Value'), body_value_schema))
    return flatten_df(flattened)
Example #5
File: vep.py Project: Hoeze/firefly
    def _parse_text(self, vep_transformed_df: f.DataFrame):
        """
        Parses json-formatted VEP output string

        Args:
            vep_transformed_df: output of `self._transform()`

        Returns:
            Spark DataFrame with the schema as defined by `self.output_schema`
        """
        vep_df = (vep_transformed_df.withColumn(
            'data', f.from_json('text', self.output_schema)).select(
                f.expr("data.*")).drop("input"))

        return vep_df
def run_spark_job(spark):

    # Create the Spark configuration: max offset of 200 per trigger,
    # bootstrap server and port as in the other run_spark_job examples here
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    # Select original_crime_type_name and disposition
    distinct_table = service_table \
        .select(psf.col('original_crime_type_name'), psf.col('disposition'))

    # count the number of each original crime type (disposition is kept so the
    # radio-code join further down has a column to match on)
    agg_df = distinct_table \
        .groupBy('original_crime_type_name', 'disposition').count()

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation
    # write output stream (console sink, complete mode for the aggregate)
    query = agg_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()


    # TODO attach a ProgressReporter
    query.awaitTermination()

    # radio code json path (the same file used by the other examples here)
    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # TODO rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    # join on the disposition column and stream the joined aggregate to the console
    join_query = agg_df \
        .join(radio_code_df, "disposition", "left_outer") \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()


    join_query.awaitTermination()
Example #7
def main():
    spark = SparkSession \
        .builder \
        .appName("ExtractEventsJob") \
        .enableHiveSupport() \
        .getOrCreate()

    #     batch - load raw events
    raw_events = spark \
        .read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:29092") \
        .option("subscribe", "events") \
        .option("startingOffsets", "earliest") \
        .option("endingOffsets", "latest") \
        .load()

    #     # streaming - load raw events
    #     raw_events = spark \
    #         .readStream \
    #         .format("kafka") \
    #         .option("kafka.bootstrap.servers", "kafka:29092") \
    #         .option("subscribe", "events") \
    #         .load()

    # filter for purchase_sword events
    purchase_events = raw_events \
        .filter(is_purchase(raw_events.value.cast('string'))) \
        .select(raw_events.value.cast('string').alias('raw_event'),
                raw_events.timestamp.cast('string'),
                from_json(raw_events.value.cast('string'),
                          purchase_sword_event_schema()).alias('json')) \
        .select('raw_event', 'timestamp', 'json.*')

    # register extracted_purchase_events to temp table
    purchase_events.registerTempTable("sword_purchase_events")

    # write to parquet file
    spark.sql("""
        create external table sword_purchase_events
        stored as parquet
        location '/tmp/sword_purchase_events'
        as
        select * from sword_purchase_events
    """)

    purchase_events.write.mode('overwrite').parquet(
        '/tmp/sword_purchase_events')
Example #8
    def transform(self) -> DataFrame:
        with_recommendations = (
            self.dataFrame.where(~F.col("top_candidates").isNull())
            .withColumn(
                "data",
                F.explode(
                    F.from_json("top_candidates", RawDataset.top_candidates_schema)
                ),
            )
            .select("*", "data.image", "data.rating", "data.note")
            .withColumnRenamed("wiki_db", "wiki")
            .withColumnRenamed("image", "image_id")
            .withColumn("confidence_rating", self.confidence_rating)
            .withColumn("source", self.source)
            .withColumn("found_on", self.found_on)
            .select(
                "wiki",
                "page_id",
                "page_title",
                "image_id",
                "confidence_rating",
                "source",
                "instance_of",
                "found_on",
            )
        )
        without_recommendations = (
            self.dataFrame.where(F.col("top_candidates").isNull())
            .withColumnRenamed("wiki_db", "wiki")
            .withColumn("image_id", F.lit(None))
            .withColumn("confidence_rating", F.lit(None))
            .withColumn("source", F.lit(None))
            .withColumn("found_on", F.lit(None))
            .select(
                "wiki",
                "page_id",
                "page_title",
                "image_id",
                "confidence_rating",
                "source",
                "instance_of",
                "found_on",
            )
        )

        return with_recommendations.union(without_recommendations)\
            .withColumn("instance_of", self.instance_of)\
            .withColumn("is_article_page", self.is_article_page)
Example #9
def main():
    spark = SparkSession \
        .builder \
        .appName("Twitchatter") \
        .getOrCreate()
    # Create DataFrame representing the stream of input lines from connection to localhost:9999
    # Subscribe to 1 topic
    df = spark \
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", config.ip_address) \
      .option("subscribe", config.topic) \
             .option("startingOffsets","earliest") \
      .load()

    #Kafka streams from source are as "key":"value"..etc.
    df.printSchema()

    #Select key:value and discard others
    #value schema: {"username":"******","message":"xxx","channel":"xxx","time":"xxx"}
    schema = StructType().add("username", StringType()).add(
        "message", StringType()).add("channel",
                                     StringType()).add("time", StringType())
    ds = df.selectExpr("CAST(value AS STRING)") \
           .select(f.from_json("value",schema).alias("message")) \
           .select("message.*")
    ds.printSchema()
    # uncomment to see data flowing in
    query = ds.writeStream.outputMode("append").format("console").start()
    query.awaitTermination()

    # write is not available for streaming data; we use create_table to create table and keyspaces
    #ds.write.format("org.apache.spark.sql.cassandra").options(table="rawtable",keyspace="test").save(mode="append")

    # again structured stream does not provide savetocassandra functionality
    #ds.saveToCassandra("test","rawtable")

    # dump data to parquet files
    #query = ds.writeStream \
    #  .format("parquet") \
    #  .option("startingOffsets", "earliest") \
    #  .option("checkpointLocation", "/home/ubuntu/twitchatter/test/check/") \
    #  .option("path", "s3a://mypqrquet/") \
    #  .start()
    #query.awaitTermination()

    # Do some simple count: unique user count per channel
    user_count = ds.groupBy("channel", "username").count()
Example #10
def stream(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "TweetsTopic") \
        .load() \
        .select(from_json(col("value").cast("String"), SCHEMA).alias("data"), col("timestamp")) \
        .select("data.*", "timestamp") \
        .select("text", "tweet_time", "source", "timestamp") \
        .writeStream \
        .outputMode("update") \
        .trigger(processingTime='2 seconds') \
        .foreachBatch(save_to_hbase_mysql) \
        .start()
    return df
Example #11
    def _parse_device_product_associations_payload(
            cls, df, payload_column_name='payload'):
        """
        Parses the JSON payload column containing the product association information.

        :param DataFrame df: The input DataFrame
        :param str payload_column_name: (Optional) The name of the JSON payload column - defaults to `payload`
        :rtype: DataFrame
        """

        payload_schema = cls._get_device_product_association_payload_schema()

        return df \
            .withColumn(payload_column_name, from_json(payload_column_name, schema=payload_schema)) \
            .select('*', '{}.*'.format(payload_column_name)) \
            .drop(payload_column_name)
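A standalone sketch of the same parse-and-flatten pattern (not part of the original class; the two-field payload schema below is hypothetical, standing in for _get_device_product_association_payload_schema()):

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.getOrCreate()

# Hypothetical payload schema standing in for the class helper.
payload_schema = StructType([
    StructField("device_id", StringType(), True),
    StructField("product_id", StringType(), True)
])

df = spark.createDataFrame(
    [("evt-1", '{"device_id": "d1", "product_id": "p9"}')],
    ["event_id", "payload"])

# Parse the JSON payload, promote its fields to top-level columns, then drop
# the struct column itself, mirroring the method above.
parsed = df \
    .withColumn("payload", from_json("payload", schema=payload_schema)) \
    .select("*", "payload.*") \
    .drop("payload")

parsed.show()  # columns: event_id, device_id, product_id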
def flattenDf(df):
    ht_schema = StructType([
        StructField('text', StringType(), True),
        StructField('indices', StringType(), True)
    ])
    flatDf = df \
        .withColumn("ht", f.from_json("hashtags", ht_schema)) \
        .withColumn("twtdate", f.from_unixtime(f.col("twttimestamp").substr(1,10), format='yyyy-MM-dd')) \
        .drop("hashtags") \
        .select(f.col("twtid"),
            f.col("twtdate"),
            f.col("twtlang"),
            f.col("twtsource"),
            f.col("ht.text").alias("twthashtag")
            )
    return flatDf
Example #13
def run_spark_job(spark):

    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    # df = spark ...
    df = (
        spark.readStream
        .format("kafka")  # set data ingestion format as Kafka
        .option("subscribe", "service-calls")  # the topic to subscribe to
        .option("kafka.bootstrap.servers", "localhost:9092")  # url and port of the bootstrap server
        .option("startingOffsets", "earliest")
        .option("maxOffsetsPerTrigger", 200)
        .load()
    )
    
    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value as STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # get different types of original_crime_type_name in 60 minutes interval
    # (watermark + window aggregation, following the completed examples here)
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "60 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    # TODO use udf to convert timestamp to right format on a call_date_time column
    converted_df = distinct_table.withColumn("DateColconv", udf_convert_time('call_date_time'))

    # apply aggregations using a window function to see how many calls occurred in a 2 day span
    calls_per_2_days = distinct_table \
        .groupBy(psf.window(distinct_table.call_datetime, "2 days")).count()

    # write output stream (console sink, complete mode for the windowed aggregate)
    query = counts_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()


    # TODO attach a ProgressReporter
    query.awaitTermination()
Example #14
def run_spark_job(spark):

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "org.sf.police.service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("stopGracefullyOnShutdown", "true") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.col('disposition')).distinct()

    agg_df = distinct_table \
      .groupBy(distinct_table.original_crime_type_name).count()

    query = agg_df \
        .writeStream \
        .trigger(processingTime="10 seconds") \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()

    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")
    join_query = agg_df.join(radio_code_df,
                             agg_df.disposition == radio_code_df.disposition,
                             "left_outer")

    join_query.awaitTermination()
 def startSpark(self):
     spark = SparkSession \
         .builder \
         .appName("twitter") \
         .master("local[*]") \
         .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4') \
         .getOrCreate()
     hadoop_conf=spark.sparkContext._jsc.hadoopConfiguration()
     hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
     hadoop_conf.set("fs.s3n.awsAccessKeyId", self.aws_key)
     hadoop_conf.set("fs.s3n.awsSecretAccessKey", self.aws_secret)
     conf = spark.sparkContext._conf.setAll([('spark.delta.logStore.class','org.apache.spark.sql.delta.storage.S3SingleDriverLogStore')])
     spark.sparkContext._conf.getAll()
     
     df = spark \
         .readStream \
         .format("kafka") \
         .option("kafka.bootstrap.servers", "localhost:9092") \
         .option("subscribe", "demo-3-twitter") \
         .option("startingOffsets", "latest") \
         .load()
 
 
     onlyValue = df.selectExpr("CAST(value AS STRING)").writeStream.format("memory").queryName("onlyValue").start()
     onlyValue.awaitTermination(timeout=10)
     
     sample = spark.table("onlyValue")
     
     kafka_schema = json.loads(sample.head().asDict()["value"])["schema"]
     spark_schema = self.transform_schema_kafka_to_spark(kafka_schema)
     schema = StructType().add("payload", spark_schema).add("schema", StructType())
     
     data = df.select(fn.from_json(df["value"].cast("string"), schema).alias("value")).alias("data")
     data.printSchema()
     
     hashtags = data.select(fn.explode("data.value.payload.entities.hashtags").alias("hashtag"),
                            fn.col("data.value.payload.coordinates").alias("coordinates"),
                            fn.col("data.value.payload.user.location").alias("location_name"),
                            fn.col("data.value.payload.text").alias("text"),
                            fn.to_timestamp("data.value.payload.created_at").alias("created_at")) \
         .select(fn.lower(fn.col("hashtag.text")).alias("hashtag"), "created_at")
     
     hashtagCount = hashtags.groupBy(fn.window(hashtags["created_at"], "10 minutes", "5 minutes"), "hashtag") \
         .count().orderBy(["window", "count"], ascending=[False, False])

     query = hashtagCount.writeStream.outputMode("complete").format("delta") \
         .trigger(processingTime="300 seconds") \
         .option('checkpointLocation', checkpoint_location).start()
     sleep(600)
     query.stop()
 def load(self, name, path, infer_schema=False):
     if name in ['ml_features_train', 'ml_features_test']:
         data = self.spark.read.parquet(path)
     else:
         schema = self.schema[name] if not infer_schema else None
         data = self.spark.read.csv(path,
                                    schema=schema,
                                    escape='"',
                                    quote='"',
                                    inferSchema=infer_schema,
                                    header=True)
     if name == 'product':
         for column, column_schema in self.metadata_schema.items():
             if column in data.columns:
                 data = data.withColumn(
                     column, F.from_json(F.col(column), column_schema))
     return data
Example #17
def main(spark: pyspark.sql.SparkSession):
    try:
        sdf = spark.readStream.format('kafka') \
            .option('kafka.bootstrap.servers', 'localhost:9092') \
            .option('subscribe', 'lcr-events') \
            .option("startingOffsets", "earliest") \
            .load()

        sdf.select(from_json(col('value').cast("string"), schema=LCR_RESULT_SCHEMA).alias('data')) \
            .select('data.*')\
            .writeStream \
            .format('console') \
            .start()

        spark.streams.awaitAnyTermination()
    finally:
        spark.stop()
Example #18
    def __init__(self, topic, spark_ip="local[2]", kafka_ip="localhost:9092"):
        self.topic = topic
        self.kafka_ip = kafka_ip
        sc = SparkContext(spark_ip, appName="WikiStream")
        self.spark = pyspark.SQLContext(sc)
        self.df = self.spark \
            .read \
            .format("kafka") \
            .option("kafka.bootstrap.servers", self.kafka_ip) \
            .option("subscribe", self.topic) \
            .option("startingOffsets", "earliest") \
            .load()

        self.events = self.df.withColumn("value", F.col("value").cast(StringType())) \
            .withColumn("value", F.from_json("value", MAIN_SCHEMA)) \
            .select("value.data.*") \
            .withColumn("time", F.col("meta.dt").cast("timestamp"))
def test1(spark):
    """
    This demonstrates reading large images from Pravega and detecting defects.
    The data field contains a base-64 encoded PNG image file.
    It uses chunked encoding to support events of 2 GiB.
    This runs out of memory because the non-Pandas runner uses fixed batches of 100.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
            "stream", "video").option("encoding", "chunked_v1").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    # df = df.withWatermark('timestamp', '60 second')

    @udf(returnType=DoubleType())
    def defect_probability(data):
        """Calculate the probability of a defect."""
        # Decode the image.
        rgb = cv2.imdecode(np.array(data), -1)
        # Perform a computation on the image to determine the probability of a defect.
        # For now, we just calculate the mean pixel value.
        # We can use any Python library, including NumPy and TensorFlow.
        p = rgb.mean() / 255.0
        return float(p)

    df = df.select('*', defect_probability('data').alias('defect_probability'))

    df = df.drop('raw_event', 'event_string', 'event', 'data')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
Example #20
def run_spark_job(spark):

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "com.udacity.streams.purchases9999") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 10) \
        .option("stopGracefullyOnShutdown", "true") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()
    """
        Once received from Kafka, the source will have the following schema:
        key[binary]
        value[binary]
        topic[string]
        partition[int]
        offset[long]
        timestamp[long]
        timestampType[int]
    """

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    # we can run this if we don't care about schema and representation
    # query = kafka_df.writeStream.outputMode("append").format("console").start()
    # query.awaitTermination()

    # build a schema
    jsonSchema = StructType([
        StructField("username", StringType(), True),
        StructField("currency", StringType(), True),
        StructField("amount", IntegerType(), True)
    ])

    # unpack columns from the json "value"
    json_df = kafka_df\
        .select(psf.from_json(psf.col("value"), jsonSchema).alias("JSON_Topic"))\
        .select("JSON_Topic.*")

    query = json_df.writeStream.outputMode("append").format("console").start()
    query.awaitTermination()
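The docstring above lists the full schema of the Kafka source; a brief sketch (an addition, assuming the same df as above) of keeping the metadata columns alongside the decoded value:

# Cast the binary key/value and keep the Kafka metadata columns listed above.
kafka_meta_df = df.selectExpr(
    "CAST(key AS STRING) AS key",
    "CAST(value AS STRING) AS value",
    "topic",
    "partition",
    "offset",
    "timestamp",
    "timestampType")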
Example #21
def get_read_stream(spark):
    args = parser.parse_args()

    PERSON_EVENTS = get_schema()
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_url) \
        .option("subscribe", args.topic) \
        .option("startingOffsets", "earliest") \
        .load() \
        .selectExpr("CAST(value AS STRING) as json") \
        .select(from_json(col("json").cast("string"), PERSON_EVENTS).alias("parsed_value")) \
        .select("parsed_value.*")

    df = normalize_fields(df)
    # df.printSchema()
    return df
def get_layer_skus_df(product_lists_df):
    """
    Take all the product lists defining the bundles, and get all unique oms_ids from those lists
    with the associated product type of each.

    :param product_lists_df:
    :return: df with two columns: (oms_id, productType)
    """
    product_schema = ArrayType(StructType([
        StructField("id", StringType()),
        StructField("productType", StringType())
    ]))
    return product_lists_df \
        .select(F.explode(F.from_json('product_list_json', product_schema)).alias('product_object')) \
        .select(F.col('product_object.id').alias('oms_id'),
                F.col('product_object.productType').alias('productType')) \
        .drop_duplicates()
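A small usage sketch (not part of the original snippet), assuming a SparkSession named spark and the same imports the function above relies on (pyspark.sql.functions as F and the pyspark.sql.types classes):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# One bundle whose product list is stored as a JSON array string.
product_lists_df = spark.createDataFrame(
    [('[{"id": "123", "productType": "phone"}, {"id": "456", "productType": "case"}]',)],
    ['product_list_json'])

# Explodes the array and returns one row per unique (oms_id, productType) pair.
get_layer_skus_df(product_lists_df).show()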
Example #23
 def job(self) -> None:
     # define transformations for input data stream and write structured data on parquet format
     query = self.stream \
         .select(F.col('value').cast('string').alias('json')) \
         .select(F.from_json('json', schema).alias('data')) \
         .select('data.*')\
         .withColumn('event_date', F.to_date('event_time')) \
         .writeStream \
         .trigger(processingTime='1 minute') \
         .outputMode("append")\
         .partitionBy('event_date')\
         .option("checkpointLocation", self.checkpoint_dir) \
         .option("startingOffsets", "earliest") \
         .option("truncate", "false") \
         .format("parquet")\
         .option("path", self.target_dir) \
         .start()
     query.awaitTermination()
Example #24
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("startingOffsets", "earliest") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
            ).count()
    query = counts_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()
    query.awaitTermination()
    countDataFrame = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
            ).count()
    query = countDataFrame \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()
    query.awaitTermination()
Example #25
def main():

    # Read from kafka
    spark = SparkSession.builder.appName("Driver GPS Streaming").getOrCreate()

    # log4j
    # log4jLogger= spark._jvm.org.apache.log4j.Logger
    # logger = log4jLogger.getLogger(__name__)

    log4jLogger = spark._jvm.org.apache.log4j
    logger = log4jLogger.LogManager.getLogger("spark_app_logs")
    # logger = log4jLogger.LogManager.getRootLogger("catfish_logs")

    logger.setLevel(log4jLogger.Level.DEBUG)

    logger.info("pyspark script logger initialized")

    sdf_locations = (spark.readStream.format("kafka").option(
        "kafka.bootstrap.servers",
        brokerAddresses).option("subscribe", TOPIC_NAME).option(
            "startingOffsets",
            "earliest").load().selectExpr("CAST(value AS STRING)", ))

    sdf_locations_schema = StructType([
        StructField("driverId", StringType(), True),
        StructField("lat", StringType(), True),
        StructField("lng", StringType(), True),
        StructField("time", StringType(), True),
    ])

    sdf_locations_data = sdf_locations.select(
        from_json("value", sdf_locations_schema).alias("a")).select("a.*")

    sdf_locations_data = apply_location_requirement(sdf_locations_data)

    sdf_locations_data.printSchema()

    # sdf_locations_data.writeStream.format("console").start().awaitTermination()

    sdf_locations_data.writeStream.outputMode("append").foreachBatch(
        partial(postgres_sink, table_name="locations")).trigger(
            processingTime="15 minute").start()

    spark.streams.awaitAnyTermination()
Example #26
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", BOOTSTRAP_SERVER) \
        .option("subscribe", TOPIC_NAME) \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition", 100) \
        .option("startingOffsets","earliest") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    distinct_table = service_table.select(
        "call_date_time",
        "disposition",
        "original_crime_type_name"
    )

    watermarked_distinct_table = distinct_table.withWatermark("call_date_time", "20 minutes")

    agg_df = watermarked_distinct_table.groupBy(
        psf.window("call_date_time", "20 minutes"),
        "original_crime_type_name"
    ).count().orderBy("count")

    query = agg_df.writeStream.outputMode("complete").format("console").start()

    query.awaitTermination()

    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    join_query = agg_df.join(radio_code_df, "disposition")

    join_query.awaitTermination()
Example #27
def parseJSONCols(spark, df, sanitize=True, *cols):
    """Auto infer the schema of a json column and parse into a struct.
    https://stackoverflow.com/questions/41107835/pyspark-parse-a-column-of-json-strings
    rdd-based schema inference works if you have well-formatted JSON,
    like ``{"key": "value", ...}``, but breaks if your 'JSON' is just a
    string (``"data"``) or is an array (``[1, 2, 3]``). In those cases you
    can fix everything by wrapping the data in another JSON object
    (``{"key": [1, 2, 3]}``). The ``sanitize`` option (default True)
    automatically performs the wrapping and unwrapping.

    The schema inference is based on this
    `SO Post <https://stackoverflow.com/a/45880574/>`_.

    Parameters
    ----------
    df : pyspark dataframe
        Dataframe containing the JSON cols.
    *cols : string(s)
        Names of the columns containing JSON.
    sanitize : boolean
        Flag indicating whether you'd like to sanitize your records
        by wrapping and unwrapping them in another JSON object layer.

    Returns
    -------
    pyspark dataframe
        A dataframe with the decoded columns.
    """
    res = df
    for i in cols:

        # sanitize if requested.
        if sanitize:
            res = (res.withColumn(
                i, psf.concat(psf.lit('{"data": '), i, psf.lit('}'))))
        # infer schema and apply it
        schema = spark.read.json(res.rdd.map(lambda x: x[i])).schema
        res = res.withColumn(i, psf.from_json(psf.col(i), schema))

        # unpack the wrapped object if needed
        if sanitize:
            res = res.withColumn(i, psf.col(i).data)
    return res
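A small usage sketch for parseJSONCols above (not part of the original example), assuming a SparkSession named spark and pyspark.sql.functions imported as psf:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [('{"a": 1, "b": "x"}', '[1, 2, 3]')],
    ['json_obj', 'json_arr'])

# Both columns get decoded; the bare array in json_arr only parses cleanly
# because sanitize=True wraps it in {"data": ...} before schema inference
# and unwraps it again afterwards.
parsed = parseJSONCols(spark, df, True, 'json_obj', 'json_arr')
parsed.printSchema()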
Example #28
def run_spark_job(spark):

    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark.readStream.format("kafka").option(
        "kafka.bootstrap.servers",
        "localhost:9092").option("subscribe", "service-calls").option(
            "maxOffsetsPerTrigger", 200).option("startingOffsets",
                                                "earliest").load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    counts_df = distinct_table.withWatermark(
        "call_datetime", "60 minutes").groupBy('original_crime_type_name').count()

    converted_df = distinct_table.select(
        psf.col('crime_id'), psf.col('original_crime_type_name'),
        udf_convert_time(psf.col('call_datetime')))

    calls_per_2_days = distinct_table.groupBy(
        psf.window(distinct_table.call_datetime, "2 days")).count()

    query = converted_df.writeStream.outputMode("append").format(
        "console").start()

    query.awaitTermination()
def consume_transactions(input_df,checkpoint_path):
    # The Below Section is for Flight Transactions transformation and loading
    new_df = input_df.withColumn('new_val', F.regexp_replace(input_df['value'], '\\\\', '')).drop('value')
    new_df = new_df.withColumn('value', F.regexp_replace(new_df['new_val'], '""', "'")).drop('new_val')
    new_df = new_df.withColumn('new_val', F.regexp_replace(new_df['value'], '}n', "}")).drop('value')
    new_df = new_df.withColumn('new_val', F.regexp_replace(new_df['new_val'], "'", ""))
    #new_df = new_df.withColumn('struct_val',F.struct(new_df['new_val'])).drop('new_val')

    transaction_schema = StructType((StructField("DestinationAirportCode", StringType()),
                             StructField("Itinerary", StringType()),
                             StructField("OneWayOrReturn", StringType()),
                             StructField("OriginAirportCode", StringType()),
                             StructField("Segment", ArrayType(StructType([StructField("ArrivalAirportCode", StringType()),
                                                                   StructField("DepartureAirportCode", StringType()),
                                                                   StructField("LegNumber", StringType()),
                                                                   StructField("NumberOfPassengers", StringType()),
                                                                   StructField("SegmentNumber", StringType())]))),
                    StructField("TransactionDateUTC", StringType()),
                    StructField("UniqueId", StringType())))

    json_df = new_df.select(F.from_json(F.col("new_val"), transaction_schema).alias("value")).selectExpr('value.*')
    json_df = json_df.withColumn('Segment', F.explode_outer(json_df['Segment']))

    json_df = json_df.withColumn('DepartureAirportCode', F.col('Segment')['DepartureAirportCode']) \
        .withColumn('ArrivalAirportCode', F.col('Segment')['ArrivalAirportCode']) \
        .withColumn('LegNumber', F.col('Segment')['LegNumber']) \
        .withColumn('NumberOfPassengers', F.col('Segment')['NumberOfPassengers']) \
        .withColumn('SegmentNumber', F.col('Segment')['SegmentNumber']) \
        .drop('Segment')

    # Stream the data, from a Kafka topic to a Spark in-memory table
    query = json_df\
        .writeStream \
        .format("memory") \
        .queryName("TransactionTable") \
        .outputMode("append") \
        .option("checkpoint",checkpoint_path)\
        .start()

    query.awaitTermination(5)

    # Let it Fill up the table
    sleep(10)
Example #30
def Analytics1():
    while (1):
        global Flags
        if Flags == True:
            spark = SparkSession \
                .builder \
                .config("spark.jars",
                        "spark-streaming-kafka-0-10_2.12-3.0.0.jar,spark-sql-kafka-0-10_2.12-3.0.0.jar,kafka-clients-2.5.0.jar,commons-pool2-2.8.0.jar,spark-token-provider-kafka-0-10_2.12-3.0.0.jar") \
                .appName("Q1") \
                .getOrCreate()
            data_spark_schema = StructType([
                StructField("classification", StringType(), True),
                StructField("status", StringType(), True),
                StructField("initial_firm_notification", StringType(), True)
            ])

            streamingInputDF = spark \
                .readStream \
                .format("kafka") \
                .option("kafka.bootstrap.servers", "192.168.56.1:9092") \
                .option("subscribe", "devices") \
                .load()
            stream_records = streamingInputDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING) as devices") \
                .select(psf.from_json("devices", data_spark_schema).alias("devices"))
            stream_records.printSchema()
            stream_records = stream_records.select("devices.*").drop("devices")

            z = stream_records.writeStream.outputMode("append").format(
                "memory").queryName("view").trigger(
                    processingTime=' 2 seconds').start()
            while (z.isActive):
                t.sleep(15)
                x = spark.sql(
                    """select classification, count(classification) from view group by classification"""
                )
                x = x.toPandas()
                x.plot(x='classification',
                       y='count(classification)',
                       kind='bar')
                plt.savefig(
                    'C:\\Users\\balla\\PycharmProjects\\ORIM1\\untitled\\static\\output.png'
                )
            break
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
  StructField("InvoiceNo",StringType(),True),
  StructField("Description",StringType(),True)))
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")).alias("newJSON"))\
  .select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)


# COMMAND ----------

udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
  return double_value ** 3
power3(2.0)


# COMMAND ----------

from pyspark.sql.functions import udf
power3udf = udf(power3)
# COMMAND ----------

from pyspark.sql.types import *
trxSchema = StructType().add("transaction", StructType().add("rewardsTransactionId", StringType()).add("sourceTransactionId", StringType()).add("transactionNumber", StringType()).add("voidedTransactionId", StringType()).add("userId", StringType()).add("membershipId", StringType()).add("memberId", StringType()).add("membershipLevel", StringType()).add("loyaltyPartner", StringType()).add("programId", StringType()).add("programName", StringType()).add("transactionStatus", StringType()).add("sourceTransactionType", StringType()).add("rewardsTransactionType", StringType()).add("rewardsTransactionSubType", StringType()).add("merchantId", StringType()).add("storeNumber", StringType()).add("cardNumber", StringType()).add("checkNumber", StringType()).add("receiptNumber", StringType()).add("terminalLocalDateTime", StringType()).add("terminalNumber", StringType()).add("redemptionItemRewardCode", StringType()).add("redemptionItemProuductSKU", StringType()).add("channelType", StringType()).add("subChannelType", StringType()).add("brandCode", StringType()).add("countryCode", StringType()).add("exchangeRate", StringType()).add("transactionSource", StringType()).add("channelSource", StringType()).add("description", StringType()).add("rewardsActivityId", StringType()).add("rewardsActivityName", StringType()).add("ticketTotalAmount", StringType()).add("ticketAccruableAmount", IntegerType()).add("cardTotalAmount", StringType()).add("cardAccruableAmount", StringType()).add("localCurrencyCode", StringType()).add("baseCurrencyCode", StringType()).add("transactionDate", StringType()).add("transactionProcessedDate", StringType()).add("utcOffset", StringType()).add("createdDate", StringType()).add("modifiedDate", StringType()).add("correlationId", StringType()).add("eventId", StringType()))

accrualSchema = StructType().add("accrual", StructType().add("rewardsAccrualId", StringType()).add("rewardsTransactionId", StringType()).add("sourceTransactionId", StringType()).add("transactionNumber", StringType()).add("userId", StringType()).add("membershipId", StringType()).add("memberId", StringType()).add("membershipLevel", StringType()).add("programId", StringType()).add("programName", StringType()).add("promotionId", StringType()).add("promotionName", StringType()).add("productSKU", StringType()).add("rewardsActivityDescription", StringType()).add("rewardsTransactionSubType", StringType()).add("pointSubType", StringType()).add("pointsAccrued", StringType()).add("pointsUsed", StringType()).add("pointTypeId", StringType()).add("pointType", StringType()).add("pointExpirationDate", StringType()).add("createdDate", StringType()).add("modifiedDate", StringType()).add("correlationId", StringType()).add("eventId", StringType()))

redemptionSchema = StructType().add("redemption", StructType().add("rewardsRedemptionId", StringType()).add("rewardsTransactionId", StringType()).add("originalTransactionId", StringType()).add("userId", StringType()).add("membershipId", StringType()).add("memberId", StringType()).add("programId", StringType()).add("programName", StringType()).add("rewardsActivityDescription", StringType()).add("typeCode", StringType()).add("pointTypeId", StringType()).add("pointType", StringType()).add("pointsRedeemed", StringType()).add("createdDate", StringType()).add("modifiedDate", StringType()).add("rewardsAccrualId", StringType()).add("accrualSourceTransactionId", StringType()).add("accrualTransactionNumber", StringType()).add("accrualMerchantId", StringType()).add("accrualStoreNumber", StringType()).add("accrualBrandCode", StringType()).add("accrualTransactionSource", StringType()).add("accrualChannelSource", StringType()).add("accrualRewardsActivityName", StringType()).add("accrualRewardsActivityDescription", StringType()).add("accrualCreatedDate", StringType()).add("accrualExpirationDate", StringType()).add("correlationId", StringType()).add("eventId", StringType()))

memberBalancesSchema = StructType().add("membershipPointBalance", StructType().add("membershipId", StringType()).add("memberId", StringType()).add("userId", StringType()).add("programName", StringType()).add("currentRewardPoints", StringType()).add("currentLevelPoints", StringType()).add("lifetimeRewardPoints", StringType()).add("lifetimeLevelPoints", StringType()).add("lifetimeRedeemedPoints", StringType()).add("modifiedDate", StringType()).add("lastAccrualTransactionDate", StringType()).add("lastRedemptionTransactionDate", StringType()).add("lastTransactionDate",StringType()).add("createdDate",StringType()).add("correlationId", StringType()).add("eventId", StringType()))

# COMMAND ----------

from pyspark.sql.functions import from_json
trxDf = validDecryptedTrxDf.select(validDecryptedTrxDf.EnqueuedDateTimeUTC, from_json(validDecryptedTrxDf.Body, trxSchema).alias("jsonBody"))
accDf = validDecryptedAccDf.select(validDecryptedAccDf.EnqueuedDateTimeUTC, from_json(validDecryptedAccDf.Body, accrualSchema).alias("jsonBody"))
redDf = validDecryptedRedDf.select(validDecryptedRedDf.EnqueuedDateTimeUTC, from_json(validDecryptedRedDf.Body, redemptionSchema).alias("jsonBody"))
memBalDf = validDecryptedMemBalDf.select(validDecryptedMemBalDf.EnqueuedDateTimeUTC, from_json(validDecryptedMemBalDf.Body, memberBalancesSchema).alias("jsonBody"))

# COMMAND ----------

# TAKES 15 MINUTES TO WRITE -->  Do we need this???
# trxDf.write.mode("Overwrite").parquet('/mnt/transactions/raw_data/' + now_pt_str_hhmmss )
# redDf.write.mode("Overwrite").parquet('/mnt/redemptions/raw_data/' + now_pt_str_hhmmss)
# accDf.write.mode("Overwrite").parquet('/mnt/accruals/raw_data/' + now_pt_str_hhmmss)

# ## which can later be accessed using the below
# from pyspark.sql import SQLContext

# sqlContext = SQLContext(sc)
## Read in data by pointing to its table name in Glue Data Catalog
schema = StructType() \
  .add('source', StringType()) \
  .add('type', StringType()) \
  .add('data', StringType()) \
  .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/', format='parquet', schema=schema)

## Build out some new columns with data extracted from the JSON string

df = src \
  .withColumn('n_ts', F.unix_timestamp('ts', "yyyy-MM-dd'T'hh:mm:ss").cast('timestamp')) \
  .withColumn('agerange', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].agerange'), \
      StructType().add('high', IntegerType()).add('low', IntegerType()) \
    ) \
  ) \
  .withColumn('smile', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].smile'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
  .withColumn('eyeglasses', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].eyeglasses'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
.withColumn('sunglasses', \