def consume_records():
    """Stream rating CSV records and print ratings for users with userId < 481.

    Reads a structured-streaming CSV source from ``$SPARK_DATA/structured``,
    filters rows with ``userId`` below 481, and appends the resulting
    ``(userId, rating)`` pairs to the console until the query terminates.

    Raises:
        KeyError: if the ``SPARK_DATA`` environment variable is not set.
    """
    spark_context = SparkContext(appName='RatingConsumer')
    sql_context = SQLContext(spark_context)

    # Directory watched by the streaming file source.
    fpath = os.path.join(os.environ['SPARK_DATA'], 'structured')

    # Explicit schema: streaming CSV sources cannot infer one.
    schema = StructType([
        StructField('userId', IntegerType(), True),
        StructField('movieId', IntegerType(), True),
        StructField('rating', FloatType(), True),
        StructField('timestamp', StringType(), True),
    ])

    # Use the context's readStream accessor (the public API) instead of
    # constructing DataStreamReader directly.
    ratings = sql_context.readStream.load(fpath, schema=schema, format='csv')

    # Registered for ad-hoc SQL queries; not referenced below.
    ratings.createOrReplaceTempView('ratingsView')

    user_481 = ratings.where("userId < 481").select("userId", "rating")

    query = (user_481
             .writeStream
             .outputMode('append')
             .format('console')
             .start())
    query.awaitTermination()
# Example No. 2
def consume_records():
    """Stream rating CSV records and print a running count of ratings per user.

    Reads a structured-streaming CSV source from ``$SPARK_DATA/structured``,
    aggregates the number of ratings per ``userId``, and writes the full
    aggregate table to the console on every trigger (complete output mode)
    until the query terminates.

    NOTE(review): this redefines consume_records from the example above; if
    both live in one module only this definition is visible — confirm intent.

    Raises:
        KeyError: if the ``SPARK_DATA`` environment variable is not set.
    """
    spark_context = SparkContext(appName='RatingConsumer')
    sql_context = SQLContext(spark_context)

    # Directory watched by the streaming file source.
    fpath = os.path.join(os.environ['SPARK_DATA'], 'structured')

    # Explicit schema: streaming CSV sources cannot infer one.
    schema = StructType([
        StructField('userId', IntegerType(), True),
        StructField('movieId', IntegerType(), True),
        StructField('rating', FloatType(), True),
        StructField('timestamp', StringType(), True),
    ])

    # Use the context's readStream accessor (the public API) instead of
    # constructing DataStreamReader directly.
    ratings = sql_context.readStream.load(fpath, schema=schema, format='csv')

    # Running count of ratings per user; 'complete' mode is required for
    # streaming aggregations without a watermark.
    user_counts = ratings.groupBy('userId').count()

    query = (user_counts
             .writeStream
             .outputMode('complete')
             .format('console')
             .start())
    query.awaitTermination()