Example #1
def test2(spark):
    """
    Stream to stream join based on a timestamp range
    """
    schema = 'timestamp timestamp, event_type string, device_id string, temp_celsius double'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller",
        controller).option("scope", scope).option("stream", "sensors").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.drop('raw_event', 'event_string', 'event')
    df = df.withWatermark('timestamp', '60 second')
    df = df.withColumnRenamed('device_id', 'device_id_1')
    df = df.withColumnRenamed('timestamp', 'timestamp_1')

    df1 = df

    df = (spark.readStream.format("pravega").option(
        "controller",
        controller).option("scope", scope).option("stream", "sensors2").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.drop('raw_event', 'event_string', 'event')
    df = df.withWatermark('timestamp', '60 second')
    df = df.withColumnRenamed('device_id', 'device_id_2')
    df = df.withColumnRenamed('timestamp', 'timestamp_2')

    df2 = df

    df = df1.join(
        df2,
        expr('device_id_1 = device_id_2 and '
             'timestamp_1 >= timestamp_2 and '
             'timestamp_1 < timestamp_2 + interval 2 second'))
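    # This joins each sensors event with sensors2 events from the same device whose
    # timestamps fall within the 2 seconds preceding it; the watermarks above bound
    # how long join state must be kept.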

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
Example #2
def test2(spark):
    """
    Read video frames from Pravega and process each micro-batch with foreachBatch.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid running out of memory, limit the number of
    # records per Arrow batch that the JVM sends to the Python UDF.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
            "stream", "video").option("encoding", "chunked_v1").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.withWatermark('timestamp', '60 second')

    def f(batch_df, batch_id):
        print('batch_id=%d' % batch_id)
        png0 = batch_df.select('data').limit(1).collect()[0][0]
        print('png0=%s' % png0[0:20])

    #     IPython.display.clear_output(wait=True)
    #     IPython.display.display(IPython.display.Image(data=png0))

    (df.writeStream.trigger(processingTime='3 seconds')  # limit trigger rate
     .foreachBatch(f).start().awaitTermination())
Example #3
def test11(spark):
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
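    # For illustration only (hypothetical writer-side code, not part of this job):
    # each writer could pick its identifier once at startup, e.g.
    #   ssrc = random.getrandbits(31)
    # and attach it to every record it emits.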
    schema = 'timestamp timestamp, frame_number int, camera int, chunk int, num_chunks int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller",
        controller).option("scope", scope).option("stream", "video").load())

    # Decode JSON event.
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select(
        '*',
        from_json('event_string', schema=schema,
                  options=dict(mode='FAILFAST')).alias('event'))
    df = df.select('*', 'event.*')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
Example #4
def test(spark):
    """
    This demonstrates reading JSON events from Pravega.
    """
    schema = 'timestamp timestamp, event_type string, device_id string, temp_celsius double'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv(
        'CHECKPOINT_LOCATION', '/tmp/spark_checkpoints_test_sensor_processor')

    df = (spark.readStream.format("pravega").option(
        "controller",
        controller).option("scope", scope).option("stream", "sensors").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.drop('raw_event', 'event_string', 'event')
    df = df.withWatermark('timestamp', '60 second')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate',
             'false').option('checkpointLocation',
                             checkpoint_location).start().awaitTermination())
Example #5
def main():
    """
    Read sensor values from a Pravega stream, randomly reorder them, and write to JSON files.
    These JSON files can then be used to train a machine learning model.
    """
    print(sys.version)
    spark = (SparkSession.builder.appName('test1').getOrCreate())
    spark.conf.set('spark.sql.shuffle.partitions', '2')
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    output_dir = '/tmp/sensor_training_data'
    df = (
        spark.read.format("pravega").option("controller", controller).option(
            "scope", scope).option("stream", "sensors")
        # .option("encoding", "chunked_v1")
        .load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    schema = 'timestamp timestamp, event_type string, device_id string, temp_celsius double'
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.drop('raw_event', 'event_string', 'event')
    #df.limit(5).show()
    (df.orderBy(
        rand(seed=1)).write.mode('overwrite').format('json').save(output_dir))
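# A minimal sketch (not part of the job above) of how the generated files might be read
# back for training; the output path and split fractions are illustrative assumptions.
def load_training_data(spark, output_dir='/tmp/sensor_training_data'):
    # Spark infers the schema from the JSON files written by main().
    df = spark.read.json(output_dir)
    # Split into training and validation sets.
    return df.randomSplit([0.8, 0.2], seed=1)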
Example #6
def test14(spark):
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
            "stream", "video").option("encoding", "chunked_v1").load())

    # Decode JSON event.
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select(
        '*',
        from_json('event_string', schema=schema,
                  options=dict(mode='FAILFAST')).alias('event'))
    df = df.select('*', 'event.*')

    df = df.withWatermark('timestamp', '60 second')

    @udf(returnType=BinaryType())
    def parse_checksum(checksum_and_data):
        return checksum_and_data[0:4]

    @udf(returnType=BinaryType())
    def parse_data(checksum_and_data):
        return checksum_and_data[4:]

    @udf(returnType=BooleanType())
    def is_checksum_correct(checksum, data):
        expected = struct.unpack('!I', checksum)[0]
        calculated = zlib.crc32(data)
        # print('expected=%d, calculated=%d' % (expected, calculated))
        return expected == calculated
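    # Writer-side sketch (hypothetical, but implied by the parsing above): the data field
    # is the payload prefixed with a 4-byte big-endian CRC32 of that payload:
    #   checksum_and_data = struct.pack('!I', zlib.crc32(payload)) + payload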

    df = df.withColumnRenamed('data', 'checksum_and_data')
    df = df.select('*',
                   parse_checksum('checksum_and_data').alias('checksum'),
                   parse_data('checksum_and_data').alias('data'))
    df = df.select(
        '*',
        is_checksum_correct('checksum', 'data').alias('is_checksum_correct'))
    df = df.select('*', length('data'))
    df = df.drop('raw_event', 'event_string', 'event', 'checksum_and_data',
                 'data')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
Example #7
def main(topic):
    # get stream
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', '199.60.17.210:9092,199.60.17.193:9092') \
        .option('subscribe', topic).load()

    # get values and decode
    df_base = messages.select(
        functions.decode(messages['value'], 'utf-8').alias('value'))

    # split dataframe column to (x, y)
    df_cols = functions.split(df_base['value'], ' ')
    df_xy = df_base.withColumn('x', df_cols.getItem(0)) \
                   .withColumn('y', df_cols.getItem(1))

    # compute x, y, xy, x^2
    df_main = df_xy.select(df_xy['x'], df_xy['y'],
                           (df_xy['x'] * df_xy['y']).alias('xy'),
                           (df_xy['x'] * df_xy['x']).alias('x_sq'))

    # compute sigma values (n, x, y, xy, x^2)
    df_sigmas = df_main.select(
                functions.count(df_main['x']).alias('n'), \
                functions.sum(df_main['x']).alias('x'), \
                functions.sum(df_main['y']).alias('y'), \
                functions.sum(df_main['xy']).alias('xy'), \
                functions.sum(df_main['x_sq']).alias('x_sq') \
            )
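    # The remaining steps implement simple least-squares linear regression on these sums
    # (here x, y, xy, x_sq denote the column sums computed above):
    #   beta  = (xy - x*y/n) / (x_sq - x*x/n)
    #   alpha = y/n - beta * x/n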

    # compute value for beta
    df_Beta = df_sigmas.select(
                    df_sigmas['*'], \
                    ( \
                        (df_sigmas['xy'] - (1 / df_sigmas['n']) * (df_sigmas['x'] * df_sigmas['y'])) / \
                        (df_sigmas['x_sq'] - (1 / df_sigmas['n']) * (df_sigmas['x'] * df_sigmas['x'])) \
                    ).alias('beta'))

    # compute value for alpha
    df_result = df_Beta.select( \
                    df_Beta['beta'], \
                    ( \
                        (df_Beta['y'] / df_Beta['n']) - (df_Beta['beta'] * (df_Beta['x'] / df_Beta['n'])) \
                    ).alias('alpha'))

    # write to output
    stream = df_result.writeStream.outputMode("complete").format(
        "console").start()
    stream.awaitTermination(600)
Example #8
def test1(spark):
    """
    This demonstrates reading large images from Pravega and detecting defects.
    The data field contains a base-64 encoded PNG image file.
    It uses chunked encoding to support events of up to 2 GiB.
    This runs out of memory because the non-Pandas runner uses fixed batches of 100.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
            "stream", "video").option("encoding", "chunked_v1").load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    # df = df.withWatermark('timestamp', '60 second')

    @udf(returnType=DoubleType())
    def defect_probability(data):
        """Calculate the probability of a defect."""
        # Decode the image.
        rgb = cv2.imdecode(np.frombuffer(data, dtype='uint8'), -1)
        # Perform a computation on the image to determine the probability of a defect.
        # For now, we just calculate the mean pixel value.
        # We can use any Python library, including NumPy and TensorFlow.
        p = rgb.mean() / 255.0
        return float(p)

    df = df.select('*', defect_probability('data').alias('defect_probability'))

    df = df.drop('raw_event', 'event_string', 'event', 'data')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
Example #9
def load_images(filenames_pattern, train_size=1.):
    """
    Using Spark new built-in data source for images,
    we want to load data into Dataframes in order to pass
    them in a computing statistics pipeline.

    Args:
    filenames_pattern : A string representing path pattern of each image
    train_size : float representing the fraction of data to use for training
    (use DataFrame.randomSplit([train_size, 1 - train_size]))
    """
    struct_keys = ["origin", "height", "width", "nChannels", "mode", "data"]
  
    df = spark.read.load(filenames_pattern, format="image")
    # Flatten the image struct into top-level columns: origin, height, width, ...
    new_cols = [df["image"].getField(alpha).alias(alpha) for alpha in struct_keys]
    new_frame = df.select(*new_cols)
    # Decode the raw image bytes into a UTF-8 string, keeping the column name 'data'.
    a = new_frame.withColumn("data", F.decode(new_frame["data"], 'UTF-8'))
    a.describe().show()
    a.printSchema()
    return a
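# Hypothetical usage sketch: split the loaded images as the docstring suggests.
# The path pattern and split fractions below are illustrative assumptions.
# images = load_images('/data/images/*.png')
# train_df, test_df = images.randomSplit([0.8, 0.2], seed=42)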
Example #10
def clean_MES(df):
    df_decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8'))
    return flatten_df(
        flatten_df(
            df_decoded.withColumn(
                'Body',
                from_json(
                    col('Body'),
                    StructType([
                        StructField("dataItemType", StringType(), True),
                        StructField("assetId", StringType(), True),
                        StructField("value", StringType(), True)
                    ], )))).drop(col('SystemProperties')).
        withColumn(
            'SystemProperties_connectionAuthMethod',
            from_json(
                col('SystemProperties_connectionAuthMethod'),
                StructType([
                    StructField("scope", StringType(), True),
                    StructField("type", StringType(), True),
                    StructField("issuer", StringType(), True),
                    StructField("acceptingIpFilterRule", StringType(), True)
                ], ))).withColumn(
                    'Body_Value',
                    from_json(
                        col('Body_Value'),
                        StructType([
                            StructField("eventId", StringType(), True),
                            StructField("assetId", StringType(), True),
                            StructField("telemetryValue", StringType(), True),
                            StructField("description", StringType(), True),
                            StructField("dateTime", StringType(),
                                        True),
                            StructField("componentName", StringType(), True),
                            StructField("status", StringType(), True)
                        ], ))))
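# Hypothetical usage sketch: 'raw_df' stands for a DataFrame whose 'Body' column holds
# base64-encoded JSON, as implied by the unbase64/decode calls above.
# cleaned = clean_MES(raw_df)
# cleaned.printSchema()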
Example #11
def test_create_thumbnails(spark):
    """
    This demonstrates reading large images from Pravega and detecting defects.
    The data field contains a base-64 encoded PNG image file.
    It uses chunked encoding to support events of up to 2 GiB.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid running out of memory, limit the number of
    # records per Arrow batch that the JVM sends to the Python UDF.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv(
        'CHECKPOINT_LOCATION',
        '/tmp/spark_checkpoints_test_video_and_sensor_processor')
    shutil.rmtree(checkpoint_location, ignore_errors=True)

    df = (
        spark.readStream.format("pravega").option(
            "controller", controller).option("scope", scope).option(
                "stream", "video").option("encoding", "chunked_v1")
        # .option("start_stream_cut", "earliest")
        .load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.withWatermark('timestamp', '1 second')
    df = df.drop('raw_event', 'event_string', 'event')

    grp = df.groupby(
        # window('timestamp', '1 second'),
        'frame_number', )
    #df = df.agg(func.collect_list(func.array(df['camera'], df['data'])).alias('cameras'))

    @pandas_udf(returnType='frame_number int, data binary',
                functionType=PandasUDFType.GROUPED_MAP)
    def combine_thumbnails(df):
        """Input is a Pandas dataframe with 1 row per camera and frame.
        Output should be a Pandas dataframe with 1 row per frame."""
        print(f'combine_thumbnails: df={df}')
        df.info(verbose=True)

        return df[['frame_number', 'data']]

    # @pandas_udf(returnType=DoubleType(), functionType=PandasUDFType.SCALAR)
    # def combine_thumbnails(s):
    #     print(f'combine_thumbnails: s={s}')
    #     def f(data):
    #         print('combine_thumbnails: data')
    #         # # Decode the image.
    #         # numpy_array = np.frombuffer(data, dtype='uint8')
    #         # rgb = cv2.imdecode(numpy_array, -1)
    #         # # Perform a computation on the image to determine the probability of a defect.
    #         # # For now, we just calculate the mean pixel value.
    #         # # We can use any Python library, including NumPy and TensorFlow.
    #         # p = rgb.mean() / 255.0
    #         return 3.14
    #     return s.apply(f)

    df = grp.apply(combine_thumbnails)
    # df = df.select('*', combine_thumbnails('cameras').alias('combined'))
    df = df.select(
        func.to_json(func.struct(df["frame_number"],
                                 df["data"])).alias("event"))

    df.printSchema()

    if False:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate',
             'true').option('checkpointLocation',
                            checkpoint_location).start().awaitTermination())
    else:
        (df.writeStream.trigger(processingTime="3 seconds").outputMode(
            "append").format("pravega").option(
                "controller", controller).option("scope", scope).option(
                    "stream", "combinedvideo").option(
                        "checkpointLocation",
                        "/tmp/spark-checkpoints-combine_thumbnails").start().
         awaitTermination())
Example #12
def test_detect_defect(spark):
    """
    This demonstrates reading large images from Pravega and detecting defects.
    The data field contains a base-64 encoded PNG image file.
    It uses chunked encoding to support events of up to 2 GiB.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid running out of memory, limit the number of
    # records per Arrow batch that the JVM sends to the Python UDF.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv(
        'CHECKPOINT_LOCATION',
        '/tmp/spark_checkpoints_test_video_and_sensor_processor')

    df = (
        spark.readStream.format("pravega").option(
            "controller", controller).option("scope", scope).option(
                "stream", "video").option("encoding", "chunked_v1")
        # .option("start_stream_cut", "earliest")
        .load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.withWatermark('timestamp', '60 second')

    @pandas_udf(returnType=DoubleType(), functionType=PandasUDFType.SCALAR)
    def defect_probability(s):
        """Calculate the probability of a defect."""
        def f(data):
            # Decode the image.
            numpy_array = np.frombuffer(data, dtype='uint8')
            rgb = cv2.imdecode(numpy_array, -1)
            # Perform a computation on the image to determine the probability of a defect.
            # For now, we just calculate the mean pixel value.
            # We can use any Python library, including NumPy and TensorFlow.
            p = rgb.mean() / 255.0
            return p

        return s.apply(f)

    df = df.select('*', defect_probability('data').alias('defect_probability'))

    df = df.drop('raw_event', 'event_string', 'event', 'data')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate',
             'false').option('checkpointLocation',
                             checkpoint_location).start().awaitTermination())
Example #13
def flatten_df(nested_df):
    # Flatten nested struct columns into top-level columns named parent_child.
    stack = [((), nested_df)]
    columns = []
    while len(stack) > 0:
        parents, df = stack.pop()
        flat_cols = [col(".".join(parents + (c[0], ))).alias("_".join(parents + (c[0], )))
                     for c in df.dtypes if c[1][:6] != "struct"]
        nested_cols = [c[0] for c in df.dtypes if c[1][:6] == "struct"]

        columns.extend(flat_cols)

        for nested_col in nested_cols:
            projected_df = df.select(nested_col + ".*")
            stack.append((parents + (nested_col, ), projected_df))

    return nested_df.select(columns)


# COMMAND ----------

from pyspark.sql.functions import unbase64, lit, decode, from_json, col
from pyspark.sql.types import StringType, MapType, StructType, StructField
df_decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8'))
df_decoded_flat = flatten_df(
    df_decoded.withColumn(
        'Body',
        from_json(
            col('Body'),
            StructType([
                StructField("dataItemType", StringType(), True),
                StructField("assetId", StringType(), True),
                StructField("value", StringType(), True)
            ], )))).drop(col('SystemProperties')).withColumn(
                'SystemProperties_connectionAuthMethod',
                from_json(
                    col('SystemProperties_connectionAuthMethod'),
                    StructType([
                        StructField("scope", StringType(), True),
Example #14
def run(spark):
    """
    This is an attempt at combining multiple video sources into a grid of images.
    WARNING: This is broken because Spark is not maintaining the time order of the images.
    This file has been superseded by the Flink/Java class MultiVideoGridJob in the flinkprocessor directory.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid running out of memory, limit the number of
    # records per Arrow batch that the JVM sends to the Python UDF.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv('CHECKPOINT_LOCATION',
                                    '/tmp/spark_checkpoints_multi_video_grid')
    shutil.rmtree(checkpoint_location, ignore_errors=True)

    df = (
        spark.readStream.format("pravega").option(
            "controller", controller).option("scope", scope).option(
                "stream", "video").option("encoding", "chunked_v1")
        # .option("start_stream_cut", "earliest")
        .load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
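    # Quantize each timestamp to the frame period (1/fps seconds, with fps defined just
    # below) so that frames from different cameras captured at nearly the same time share
    # one discrete_timestamp and can be grouped into a single grid image.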
    fps = 2.0
    df = df.selectExpr(
        '*',
        f'timestamp(floor(cast(timestamp as double) * {fps}) / {fps}) as discrete_timestamp'
    )
    df = df.withWatermark('discrete_timestamp', '5 second')
    df = df.drop('raw_event', 'event_string', 'event')

    thumbnail_size = (84, 84)

    @pandas_udf(returnType='binary', functionType=PandasUDFType.SCALAR)
    def decode_and_scale_image(data_series, ssrc):
        def f(data):
            in_pil = Image.open(io.BytesIO(data))
            out_pil = in_pil.resize(thumbnail_size)
            return out_pil.tobytes()

        return data_series.apply(f)

    df = df.select(
        '*',
        decode_and_scale_image(df['data'], df['ssrc']).alias('image'))
    df = df.select(
        '*',
        func.to_json(
            func.struct(df['discrete_timestamp'], df['frame_number'],
                        df['camera'])).alias('json'))

    df = df.repartition(1)

    grp = df.groupby(
        # window('timestamp', '1 second'),
        'discrete_timestamp', )

    @pandas_udf(
        returnType=
        'timestamp timestamp, frame_number int, ssrc int, data binary, source string',
        functionType=PandasUDFType.GROUPED_MAP)
    def combine_images_into_grid(df):
        # TODO: This Pandas UDF provides incorrect results because it is called before the aggregation is finalized by the watermark.
        if df.empty:
            return None
        row0 = df.iloc[0]
        num_cameras = df.camera.max() + 1
        grid_count = math.ceil(math.sqrt(num_cameras))
        # Determine number of images per row and column.
        image_width = thumbnail_size[0]
        image_height = thumbnail_size[1]
        image_mode = 'RGB'
        margin = 1
        status_width = 0
        # Create blank output image, white background.

        out_pil = Image.new(
            'RGB',
            ((image_width + margin) * grid_count - margin + status_width,
             (image_height + margin) * grid_count - margin), (128, 128, 128))

        # Add images from each camera
        def add_image(r):
            # in_pil = Image.open(io.BytesIO(r['image']))
            in_pil = Image.frombytes(image_mode, (image_width, image_height),
                                     r['image'])
            x = (r['camera'] % grid_count) * (image_width + margin)
            y = (r['camera'] // grid_count) * (image_height + margin)
            out_pil.paste(in_pil, (x, y))

        df.apply(add_image, axis=1)

        # font = ImageFont.truetype('/usr/share/fonts/truetype/freefont/FreeSans.ttf', font_size)
        # draw = ImageDraw.Draw(img)
        # draw.text((status_width, 0), 'FRAME\n%05d\nCAMERA\n %03d' % (frame_number, camera), font=font, align='center')

        out_bytesio = io.BytesIO()
        out_pil.save(out_bytesio, format='PNG', compress_level=0)
        out_bytes = out_bytesio.getvalue()

        new_row = pd.Series()
        new_row['timestamp'] = row0['discrete_timestamp']
        new_row['ssrc'] = 0
        new_row['frame_number'] = 0
        new_row['source'] = df[['camera', 'frame_number',
                                'timestamp']].to_json()
        new_row['data'] = out_bytes
        # new_row['data'] = b''
        return pd.DataFrame([new_row])

    # @pandas_udf(returnType='string', functionType=PandasUDFType.SCALAR)
    # def combine_images_into_grid2(json):
    #     # TODO
    #     def f(data):
    #         in_pil = Image.open(io.BytesIO(data))
    #         out_pil = in_pil.resize(thumbnail_size)
    #         return out_pil.tobytes()
    #     return data_series.apply(f)

    df = grp.apply(combine_images_into_grid)
    df = df.select(
        func.to_json(func.struct(df["frame_number"],
                                 df["data"])).alias("event"))

    # df = grp.agg(func.collect_list('json'))
    # df = df.selectExpr('*', '0 as ssrc')
    # window = Window.partitionBy('ssrc').orderBy('discrete_timestamp').rowsBetween(Window.unboundedPreceding, Window.currentRow)
    # df = df.select('*', func.row_number().over(window))

    # TODO: Output rows are not written in timestamp order. How can this be fixed?
    # Below gives error: Sorting is not supported on streaming DataFrames/Datasets, unless it is on aggregated DataFrame/Dataset in Complete output mode
    # df = df.sortWithinPartitions(df['discrete_timestamp'])

    df.printSchema()

    if False:
        (df.writeStream
         # .trigger(processingTime='1000 milliseconds')    # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate',
             'false').option('checkpointLocation',
                             checkpoint_location).start().awaitTermination())
    else:
        (df.writeStream.trigger(processingTime="1000 milliseconds").outputMode(
            "append").format("pravega").option(
                "controller", controller).option(
                    "scope", scope).option("stream", "combinedvideo").option(
                        "checkpointLocation",
                        checkpoint_location).start().awaitTermination())
Example #15
def test12(spark):
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, chunk int, num_chunks int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller",
        controller).option("scope", scope).option("stream", "video").load())

    # Decode JSON event.
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select(
        '*',
        from_json('event_string', schema=schema,
                  options=dict(mode='FAILFAST')).alias('event'))
    df = df.select('*', 'event.*')

    df = df.withWatermark('timestamp', '60 second')

    # The number of chunks must be fixed for the entire Spark job because it determines the number of joins.
    num_chunks = 3
    # Ignore any records with a different number of chunks. Perhaps these can be sent to an error stream.
    df = df.filter(df.num_chunks == num_chunks)
    # Create a dataframe for each chunk.
    chunk_dfs = [
        df.filter(df.chunk == chunk_index).drop('chunk').withColumnRenamed(
            'data', 'data%d' % chunk_index)
        for chunk_index in range(num_chunks)
    ]
    # Join chunks.
    df = chunk_dfs[0]
    for chunk_id in range(1, num_chunks):
        df = df.join(chunk_dfs[chunk_id], ['timestamp', 'camera', 'ssrc'],
                     'inner')
    # Concatenate binary data.
    data_cols = ['data%d' % chunk_index for chunk_index in range(num_chunks)]
    df = df.select('timestamp', 'camera', 'ssrc',
                   concat(*data_cols).alias('data'))
    # Deduplication.
    df = df.dropDuplicates(['timestamp', 'camera'])

    @udf(returnType=BinaryType())
    def parse_checksum(checksum_and_data):
        return checksum_and_data[0:4]

    @udf(returnType=BinaryType())
    def parse_data(checksum_and_data):
        return checksum_and_data[4:]

    @udf(returnType=BooleanType())
    def is_checksum_correct(checksum, data):
        expected = struct.unpack('!I', checksum)[0]
        calculated = zlib.crc32(data)
        print('expected=%d, calculated=%d' % (expected, calculated))
        return expected == calculated

    df = df.withColumnRenamed('data', 'checksum_and_data')
    df = df.select('*',
                   parse_checksum('checksum_and_data').alias('checksum'),
                   parse_data('checksum_and_data').alias('data'))
    df = df.select(
        '*',
        is_checksum_correct('checksum', 'data').alias('is_checksum_correct'))
    # df = df.filter(df.is_checksum_correct == True)

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
Example #16
if __name__ == '__main__':
    spark = pyspark.sql.SparkSession \
            .builder \
            .appName("StructuredNetworkWordCount") \
            .getOrCreate()

    df = spark \
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", "localhost:9092,localhost:9093,localhost:9094") \
      .option("subscribe", "la-crime") \
      .load()

    from pyspark.sql.functions import get_json_object, decode
    df_string = df.select(decode(df.value,
                                 'UTF-8').alias('json'))  # binary to UTF-8
    crime_types = df_string.select(
        get_json_object(df_string.json,
                        '$.Crime Code Description').alias('types'))
    crime_types_count = crime_types.groupBy("types").count().orderBy(
        'count', ascending=False)  # .limit(5)
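    # This aggregates over the entire stream (no window or watermark), so the query
    # below uses the 'complete' output mode rather than 'append'.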

    query = crime_types_count\
        .writeStream \
        .outputMode("complete")\
        .format("console") \
        .start()

    query.awaitTermination()