def _writeAndVerify(self, ports):
    """Read test events through a polling Flume stream and verify them.

    A streaming context is built from ``self.sc`` and ``self.duration``, a
    polling stream is attached to ``localhost`` on every port in *ports*,
    and each collected RDD is accumulated in a local list. After the
    helper in ``self._utils`` has sent its data, the method waits (via
    ``self.wait_for``) for ``self._utils.getTotalEvents()`` entries and
    passes element 0 and element 1 of every collected event to
    ``self._utils.assertOutput``.

    The streaming context is stopped with ``stop(False)`` whether or not
    the verification succeeds.
    """
    streaming_ctx = StreamingContext(self.sc, self.duration)
    try:
        sink_addresses = [("localhost", sink_port) for sink_port in ports]
        polling_stream = FlumeUtils.createPollingStream(
            streaming_ctx,
            sink_addresses,
            maxBatchSize=self._utils.eventsPerBatch(),
            parallelism=5)

        received = []

        def collect_batch(_, rdd):
            # The first callback argument is not needed here.
            received.extend(rdd.collect())

        polling_stream.foreachRDD(collect_batch)
        streaming_ctx.start()

        self._utils.sendDatAndEnsureAllDataHasBeenReceived()
        self.wait_for(received, self._utils.getTotalEvents())

        # Split every event into its two components in a single pass.
        headers = []
        bodies = []
        for event in received:
            headers.append(event[0])
            bodies.append(event[1])
        self._utils.assertOutput(headers, bodies)
    finally:
        streaming_ctx.stop(False)
from operator import add
import sys


def _request_path_segments(msg):
    """Split the 7th space-separated field of *msg* on "/".

    Returns an empty list when *msg* has fewer than seven fields, so callers
    can check the length instead of hitting an IndexError inside a task.
    """
    fields = msg.split(" ")
    if len(fields) < 7:
        return []
    return fields[6].split("/")


def _is_department_request(msg):
    """Return True when segment 1 is "department" and segment 2 exists."""
    segments = _request_path_segments(msg)
    # Segment 2 is read by the mapping step below, so require it here;
    # otherwise a line ending in ".../department" would crash that step.
    return len(segments) > 2 and segments[1] == "department"


# Validate the command line before any Spark object is created, so a missing
# argument produces a usage message instead of a late IndexError.
if len(sys.argv) < 4:
    sys.exit("Usage: %s <host> <port> <output_prefix>" % sys.argv[0])

host_name = sys.argv[1]
port = int(sys.argv[2])
output_prefix = sys.argv[3]

conf = (
    SparkConf()
    .setAppName("streaming_department_count")
    .setMaster("yarn-client")
)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 30)

agents = [(host_name, port)]
polling_stream = FlumeUtils.createPollingStream(ssc, agents)

# Each stream element is indexed with [1] to obtain the text that is parsed.
messages = polling_stream.map(lambda msg: msg[1])

# Malformed lines are dropped rather than failing the batch.
department_msg = messages.filter(_is_department_request)
department_names = department_msg.map(
    lambda msg: (_request_path_segments(msg)[2], 1))

# Name kept as spelled in the original; later code may refer to it.
department_conut = department_names.reduceByKey(add)
department_conut.saveAsTextFiles(output_prefix)

# NOTE(review): no awaitTermination() call is visible in this chunk --
# confirm that one follows, otherwise the driver may exit right away.
ssc.start()
# Remove whatever is in the test directory before continuing. The exit
# status of the shell command is discarded, so a failed cleanup goes
# unnoticed.
cmd = "rm -rf /tmp/Flume_spark_Streaming_Test/*"
os.system(cmd)

from pyspark.streaming.flume import FlumeUtils

port = 9999
# createPollingStream documents `addresses` as a list of (host, port)
# tuples, so pass a list rather than a set.
addresses = [(hostname, port)]

# Pull-based approach using a custom sink.
#
# Flume pushes data into the sink, and the data stays buffered. Spark
# Streaming uses a reliable Flume receiver
# (jar: org.apache.spark.streaming.flume.sink.SparkSink) and transactions to
# pull data from the sink.
fstream = FlumeUtils.createPollingStream(ssc, addresses)
# fstream.pprint()  # sample output: ({}, u'naresh,kumar,22')

# Keep only element 1 of every event (the text part in the sample above).
input_stream = fstream.map(lambda x: x[1])

# Processing the DStream.
#
# This step does the required processing/filtering on the main DStream and
# generates a tuple, a list or a raw value (the latter when input_stream is
# passed on directly, without any map/filter).
#
# If it is unclear what type (list/tuple/raw) a DStream has after a
# map/filter, use `print type(xyz)` in the function that receives the
# DStream. The type is then printed on one of the executors (NOT on the
# console), which shows how the records have to be processed.

# This sends records as Tuple - (u'naresh', u'kumar', u'21')
if __name__ == "__main__": # if len(sys.argv) != 3: # print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr) # exit(-1) PYSPARK_PYTHON = "C:\\Python27\\python.exe" #多版本python情况下,需要配置这个变量指定使用哪个版本 os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON sc = SparkContext(appName="wc002") sqlContext = SQLContext(sc) sc.setLogLevel("ERROR") ssc = StreamingContext(sc, 5) address = [("cdh5-slave2", 9999)] fps = FlumeUtils.createPollingStream(ssc, address) # ssc.checkpoint("hdfs://cdh-master:8020/checkpoint") #提交任务的用户要有目录的读写权限! # lines = fps.map(lambda x: (x[1])).pprint() def p(x): print(type(x), x) def get_field_value(row): # count # uuid # title # reason_type # caseid # province
import os

# Request the Flume connector packages through PYSPARK_SUBMIT_ARGS. This is
# assigned before the SparkContext below is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-streaming-flume-sink_2.11:2.1.0,'
    'org.apache.spark:spark-streaming-flume_2.11:2.1.0 '
    'pyspark-shell'
)

from pyspark.streaming.flume import FlumeUtils
from pyspark.streaming import StreamingContext
from pyspark import SparkContext

sc = SparkContext(appName="PythonSparkStreamingFlume")
sc.setLogLevel("ERROR")

# A single streaming context is enough: the original also built an unused
# StreamingContext(sc, 10) that was never started or referenced again.
streamingContext = StreamingContext(sc, 5)

# (host, port) pairs handed to the polling stream.
addresses = [("IPADDRESS", 2727)]
flumeStream = FlumeUtils.createPollingStream(streamingContext, addresses)

# Element 1 of every event is split on commas and printed per batch.
lines = flumeStream.map(lambda x: x[1].split(","))
lines.pprint()

streamingContext.start()
streamingContext.awaitTermination()