def consume_records():
    """Stream MovieLens rating CSVs and print ratings from low-id users.

    Reads CSV files under ``$SPARK_DATA/structured`` as a structured
    stream, keeps the ``(userId, rating)`` columns for rows where
    ``userId < 481``, and appends each micro-batch to the console,
    blocking until the streaming query terminates.
    """
    sc = SparkContext(appName='RatingConsumer')
    sql_ctx = SQLContext(sc)
    reader = DataStreamReader(sql_ctx)

    data_dir = os.path.join(os.environ['SPARK_DATA'], 'structured')

    # Streaming CSV sources cannot infer a schema, so declare it explicitly.
    schema = StructType([
        StructField('userId', IntegerType(), True),
        StructField('movieId', IntegerType(), True),
        StructField('rating', FloatType(), True),
        StructField('timestamp', StringType(), True),
    ])

    ratings = reader.load(data_dir, schema=schema, format='csv')
    # Register a temp view so the stream is also queryable via SQL.
    ratings.createOrReplaceTempView('ratingsView')

    low_id_ratings = ratings.where("userId < 481").select("userId", "rating")

    query = (low_id_ratings
             .writeStream
             .outputMode('append')
             .format('console')
             .start())
    query.awaitTermination()
def consume_records():
    """Stream MovieLens rating CSVs and print per-user rating counts.

    Reads CSV files under ``$SPARK_DATA/structured`` as a structured
    stream, aggregates a running count of rows per ``userId``, and
    writes the full (``complete``-mode) aggregation result to the
    console on every micro-batch, blocking until the query terminates.
    """
    sc = SparkContext(appName='RatingConsumer')
    sql_ctx = SQLContext(sc)
    reader = DataStreamReader(sql_ctx)

    data_dir = os.path.join(os.environ['SPARK_DATA'], 'structured')

    # Streaming CSV sources cannot infer a schema, so declare it explicitly.
    schema = StructType([
        StructField('userId', IntegerType(), True),
        StructField('movieId', IntegerType(), True),
        StructField('rating', FloatType(), True),
        StructField('timestamp', StringType(), True),
    ])

    ratings = reader.load(data_dir, schema=schema, format='csv')

    per_user_counts = ratings.groupBy('userId').count()

    # 'complete' mode is required: the running aggregation must be
    # re-emitted in full each trigger.
    query = (per_user_counts
             .writeStream
             .outputMode('complete')
             .format('console')
             .start())
    query.awaitTermination()
def readStream(self) -> DataStreamReader:
    """
    Expose a :class:`DataStreamReader` for loading data streams as a
    streaming :class:`DataFrame`.

    .. versionadded:: 2.0.0

    Notes
    -----
    This API is evolving.

    Returns
    -------
    :class:`DataStreamReader`

    Examples
    --------
    >>> spark.readStream
    <pyspark.sql.streaming.readwriter.DataStreamReader object ...>

    The example below uses Rate source that generates rows continously.
    After that, we operate a modulo by 3, and then write the stream out to the console.
    The streaming query stops in 3 seconds.

    >>> import time
    >>> df = spark.readStream.format("rate").load()
    >>> df = df.selectExpr("value % 3 as v")
    >>> q = df.writeStream.format("console").start()
    >>> time.sleep(3)
    >>> q.stop()
    """
    stream_reader = DataStreamReader(self)
    return stream_reader
def readStream(self):
    """
    Expose a :class:`DataStreamReader` for loading data streams as a
    streaming :class:`DataFrame`.

    .. note:: Evolving.

    :return: :class:`DataStreamReader`
    """
    # Delegate to the wrapped session so the reader shares its state.
    wrapped_session = self._wrapped
    return DataStreamReader(wrapped_session)
def readStream(self):
    """
    Expose a :class:`DataStreamReader` for loading data streams as a
    streaming :class:`DataFrame`.

    .. note:: Evolving.

    :return: :class:`DataStreamReader`

    >>> text_sdf = sqlContext.readStream.text(tempfile.mkdtemp())
    >>> text_sdf.isStreaming
    True
    """
    stream_reader = DataStreamReader(self)
    return stream_reader
def readStream(self):
    """
    Expose a :class:`DataStreamReader` for loading data streams as a
    streaming :class:`DataFrame`.

    .. note:: Experimental.

    :return: :class:`DataStreamReader`

    >>> text_sdf = sqlContext.readStream.text(os.path.join(tempfile.mkdtemp(), 'data'))
    >>> text_sdf.isStreaming
    True
    """
    stream_reader = DataStreamReader(self)
    return stream_reader
def readStream(self) -> DataStreamReader:
    """
    Expose a :class:`DataStreamReader` for loading data streams as a
    streaming :class:`DataFrame`.

    .. versionadded:: 2.0.0

    Notes
    -----
    This API is evolving.

    Returns
    -------
    :class:`DataStreamReader`
    """
    # Delegate to the wrapped session so the reader shares its state.
    wrapped_session = self._wrapped
    return DataStreamReader(wrapped_session)
def readStream(self) -> DataStreamReader:
    """
    Expose a :class:`DataStreamReader` for loading data streams as a
    streaming :class:`DataFrame`.

    .. versionadded:: 2.0.0

    Notes
    -----
    This API is evolving.

    Returns
    -------
    :class:`DataStreamReader`

    >>> text_sdf = sqlContext.readStream.text(tempfile.mkdtemp())
    >>> text_sdf.isStreaming
    True
    """
    # Build the reader against the underlying SparkSession.
    session = self.sparkSession
    return DataStreamReader(session)