def stream(ssc):
    zkQuorum = "localhost:2181"
    topic = "topic1"
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    kstream = KafkaUtils.createDirectStream(ssc, topics=['topic1'],
                                            kafkaParams={"metadata.broker.list": "localhost:9092"})
    tweets = tweets.map(lambda x: x[1].encode("ascii", "ignore"))
    return tweets
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time.

    WARNING: this function only works for Spark 1.4.0+.

    Args:
        brokers: the Kafka broker(s) to read the topic from
        topic: the Kafka topic used as input
        bucket_interval: the time interval in seconds (int) that the job will bucket
        output_msg: function of (sc, ssc) returning the callback passed to foreachRDD
        message_parse: function that extracts the bucket key from a message
        valueDecoder: optional decoder for Kafka message values

    Returns:
        None
    """
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers},
                                            valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)
    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
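A minimal sketch of the two callables ss_direct_kafka_bucket_counter expects; the names match the parameters above, but the message format (a leading ISO timestamp) and the print-based output are assumptions for illustration only.

def message_parse(line):
    # Assumed format: each message starts with an ISO timestamp; bucket by minute.
    return line[:16]

def output_msg(sc, ssc):
    def print_counts(time, rdd):
        # foreachRDD hands each batch of (bucket, count) pairs to this callback.
        print("===== %s =====" % str(time))
        for bucket, count in rdd.collect():
            print(bucket, count)
    return print_counts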
def get_kafka_stream(topic, streaming_context):
    offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
    app_name = streaming_context.sparkContext.appName
    saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
    if len(saved_offset_spec) < 1:
        MonMetricsKafkaProcessor.log_debug(
            "No saved offsets available..."
            "connecting to kafka without specifying offsets")
        kvs = KafkaUtils.createDirectStream(
            streaming_context, [topic],
            {"metadata.broker.list": cfg.CONF.messaging.brokers})
        return kvs
    else:
        from_offsets = {}
        for key, value in saved_offset_spec.items():
            if key.startswith("%s_%s" % (app_name, topic)):
                # spec_app_name = value.get_app_name()
                spec_topic = value.get_topic()
                spec_partition = int(value.get_partition())
                # spec_from_offset = value.get_from_offset()
                spec_until_offset = value.get_until_offset()
                # composite_key = "%s_%s_%s" % (spec_app_name,
                #                               spec_topic,
                #                               spec_partition)
                # partition = saved_offset_spec[composite_key]
                from_offsets[
                    TopicAndPartition(spec_topic, spec_partition)
                ] = long(spec_until_offset)

        MonMetricsKafkaProcessor.log_debug(
            "get_kafka_stream: calling createDirectStream :"
            " topic:{%s} : start " % topic)
        for key, value in from_offsets.items():
            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream : "
                "offsets : TopicAndPartition:{%s,%s}, value:{%s}" %
                (str(key._topic), str(key._partition), str(value)))
        MonMetricsKafkaProcessor.log_debug(
            "get_kafka_stream: calling createDirectStream : "
            "topic:{%s} : done" % topic)

        kvs = KafkaUtils.createDirectStream(
            streaming_context, [topic],
            {"metadata.broker.list": cfg.CONF.messaging.brokers},
            from_offsets)
        return kvs
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    tweets = tweets.flatMap(lambda line: line.split(" "))
    words = tweets.flatMap(lambda line: line.split(" "))
    tweets = tweets.filter(lambda x: x in pwords or x in nwords)
    tweets = tweets.map(lambda x: ("positive", 1) if x in pwords else ("negative", 1))
    tweets = tweets.reduceByKey(lambda x, y: x + y)
    tweets = tweets.updateStateByKey(updateFunction)
    tweets.pprint()

    pds = words.filter(lambda x: x in pwords)
    nds = words.filter(lambda x: x in nwords)

    plist = []
    nlist = []
    pds.foreachRDD(lambda t, rdd: plist.append(rdd.count()))
    nds.foreachRDD(lambda t, rdd: nlist.append(rdd.count()))

    counts = []
    ssc.start()                          # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    for i in range(0, len(plist)):
        counts.append((plist[i], nlist[i]))
    return counts
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))
    # tweets.pprint()

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
    words = tweets.flatMap(lambda line: line.split(" "))
    pairs = words.map(classifier).map(lambda word: (word, 1)) \
        .filter(lambda x: x[0] != 'none').reduceByKey(lambda a, b: a + b)
    runningCounts = pairs.updateStateByKey(updateFunction)
    runningCounts.pprint()

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    pairs.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()                          # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    # print counts
    return counts
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc, [config.get('kafka', 'topic')], kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)

    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))
    # Print the first ten elements of each RDD generated in this DStream to the console
    # tweets.pprint()

    words = tweets.flatMap(lambda line: line.split(" "))
    posNegPairs = words.map(lambda word: myMapping(word, pwords, nwords))
    filteredPairs = posNegPairs.filter(lambda x: x[0] != "na")
    posNegCounts = filteredPairs.reduceByKey(lambda x, y: x + y)

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    cumulativeCounts = posNegCounts.updateStateByKey(myRunningUpdate)
    cumulativeCounts.pprint()

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    posNegCounts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()                          # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    return counts
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    pword_rdd = tweets.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: ("positive", 1) if word in pwords else ("positive", 0)) \
        .reduceByKey(lambda a, b: a + b)
    nword_rdd = tweets.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: ("negative", 1) if word in nwords else ("negative", 0)) \
        .reduceByKey(lambda a, b: a + b)

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # make the plot on this rdd - combined_rdd
    combined_rdd = pword_rdd.union(nword_rdd)
    running_counts = combined_rdd.updateStateByKey(updateFunction)

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    combined_rdd.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    # print "printing dstream"
    running_counts.pprint()

    # Start the computation
    ssc.start()
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    return counts
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()  # show the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    # Each element of tweets will be the text of a tweet.
    # Need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    pnTweets = tweets.flatMap(lambda line: line.split(" "))
    pnTweetsPairs = pnTweets.map(lambda x: determine(x, pwords, nwords))
    wordCounts = pnTweetsPairs.reduceByKey(lambda x, y: x + y)

    totalCounts = pnTweetsPairs.updateStateByKey(updateFunction)
    totalCounts.pprint()

    # Let the counts variable hold the word counts for all time steps
    # Need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    wordCounts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()                          # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    # Because counts also includes words that are neither positive nor negative, filter those out.
    newCounts = []
    for count in counts:
        newCount = [item for item in count if item[0] == "positive" or item[0] == "negative"]
        newCounts.insert(len(newCounts), newCount)
    return newCounts
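The snippet above calls a determine() helper that is not shown; a minimal sketch of what it plausibly looks like (the ("none", 1) fallback, filtered out afterwards, is an assumption):

def determine(word, pwords, nwords):
    # Key each word by sentiment so the pairs can be reduced by key.
    if word in pwords:
        return ("positive", 1)
    elif word in nwords:
        return ("negative", 1)
    else:
        return ("none", 1)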
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
    words = tweets.flatMap(lambda x: x.split(" ")).filter(lambda x: x in pwords or x in nwords)
    wordPairs = words.map(lambda x: ("positive", 1) if x in pwords else ("negative", 1))
    wordCount = wordPairs.reduceByKey(lambda x, y: x + y)
    runningCounts = wordPairs.updateStateByKey(updateFunction)
    runningCounts.pprint()

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    wordCount.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()                          # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    return counts
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
    words = tweets.flatMap(lambda line: line.split(' ')) \
        .map(lambda word: ('positive', 1) if word in pwords
             else ('negative', 1) if word in nwords
             else ('none', 1)) \
        .filter(lambda x: x[0] == 'positive' or x[0] == 'negative') \
        .reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    def updateValues(values, count):
        if count is None:
            count = 0
        return sum(values, count)

    updatedWords = words.updateStateByKey(updateValues)
    updatedWords.pprint()

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    # YOURDSTREAMOBJECT.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    words.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()                          # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    return counts
def test_kafka_direct_stream_transform_get_offsetRanges(self):
    """Test the Python direct Kafka stream transform get offsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

    offsetRanges = []

    def transformWithOffsetRanges(rdd):
        for o in rdd.offsetRanges():
            offsetRanges.append(o)
        return rdd

    # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together;
    # only the TransformedDStreams can be folded together.
    stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()
    self.ssc.start()
    self.wait_for(offsetRanges, 1)

    self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()  # show the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def main(): sc = SparkContext(appName="IntrusionDetector") ssc = StreamingContext(sc, batch_durations) kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker}) kvs.foreachRDD(processRDD) ssc.start() ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # tweets.pprint()
    words = tweets.flatMap(lambda tweet: tweet.split(" "))
    # words.pprint()
    positive = words.filter(lambda x: (x in pwords))
    negative = words.filter(lambda x: (x in nwords))
    # positive.pprint()
    # negative.pprint()
    ppairs = positive.map(lambda p: ('positive', 1))
    npairs = negative.map(lambda n: ('negative', 1))

    pwordCounts = ppairs.reduceByKey(lambda x, y: x + y)
    nwordCounts = npairs.reduceByKey(lambda x, y: x + y)

    count = pwordCounts.union(nwordCounts)
    # count.pprint()
    # pwordCounts.pprint()
    # nwordCounts.pprint()

    def updateFunction(newValues, runningCount):
        if runningCount is None:
            runningCount = 0
        return sum(newValues, runningCount)

    prunningCounts = pwordCounts.updateStateByKey(updateFunction)
    nrunningCounts = nwordCounts.updateStateByKey(updateFunction)
    # prunningCounts.pprint()
    # nrunningCounts.pprint()

    total = prunningCounts.union(nrunningCounts)
    total.pprint()

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    count.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()                          # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    return counts
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    # ssc = StreamingContext(sc, interval_seconds)
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
def read_tweets(): sc = SparkContext(appName="sentimentProducer") ssc = StreamingContext(sc,600) # Test 60 segundos brokers = "localhost:9092" kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers}) kvs.foreachRDD(create_format) producer.flush() ssc.start() ssc.awaitTermination()
def functionToCreateContext(): sc = SparkContext(appName="StreamingExampleWithKafka") ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"} kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts) lines = kvs.map(lambda x: x[1]) counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction) counts.pprint() return ssc
def test_kafka_direct_stream(self):
    """Test the Python direct Kafka stream API."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
    self._validateStreamResult(sendData, stream)
def test_kafka_direct_stream_from_offset(self):
    """Test the Python direct Kafka stream API with start offset specified."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    fromOffsets = {TopicAndPartition(topic, 0): long(0)}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
    self._validateStreamResult(sendData, stream)
def stream_kafka():
    global ssc
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['2008'],
        kafkaParams={"metadata.broker.list": 'ip-172-31-12-78.us-west-1.compute.internal:6667'})
    contents = kstream.flatMap(get_airport_carrier_delay) \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
        .foreachRDD(top_complex_average)
    ssc.start()
    ssc.awaitTerminationOrTimeout(15000)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
def stream_kafka():
    global ssc
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['2008'],
        kafkaParams={"metadata.broker.list": 'ip-172-31-12-78.us-west-1.compute.internal:6667'})
    # kstream = KafkaUtils.createStream(ssc, "localhost:2181", "raw-event-streaming-consumer", {"2008": 1})
    contents = kstream.flatMap(get_airports).updateStateByKey(update_count)
    contents.foreachRDD(lambda rdd: top_airports(rdd))
    ssc.start()
    ssc.awaitTerminationOrTimeout(15000)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store', dest='enriched_data_path',
                      help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path',
                      help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store', dest='kafka_zookeeper_hosts',
                      help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store', dest='kafka_broker_list',
                      help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store', dest='kafka_message_topic',
                      help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store', dest='kafka_alert_topic',
                      help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store', dest='kafka_enriched_data_topic',
                      help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float', default=15.0,
                      action='store', dest='streaming_batch_duration_sec',
                      help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
                      action='store', dest='max_batches',
                      help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: process_messages(time, rdd,
        ssc=ssc,
        model=model,
        enriched_data_path=options.enriched_data_path,
        zookeeper_hosts=options.kafka_zookeeper_hosts,
        kafka_alert_topic=options.kafka_alert_topic,
        kafka_enriched_data_topic=options.kafka_enriched_data_topic,
        max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
def main():
    brokers = 'localhost:9092'
    topic = 'openbmp.parsed.unicast_prefix'
    sc = SparkContext(appName='BGPPrefixOriginValidation')
    ssc = StreamingContext(sc, 2)

    directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {'metadata.broker.list': brokers})
    # directKafkaStream.pprint()
    lines = directKafkaStream.flatMap(lambda x: x[1].splitlines()) \
        .filter(lambda line: line.startswith('add'))
    structured_rdd = lines.map(structure_data)
    structured_rdd.foreachRDD(lambda rdd: rdd.foreachPartition(validate_bgp_prefix))

    ssc.start()
    ssc.awaitTermination()
def createContext(brokers, topic):
    print("Create new context")
    conf = SparkConf().set("spark.default.parallelism", "2") \
        .set("spark.streaming.kafka.maxRatePerPartition", 1000)
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount", conf=conf)
    ssc = StreamingContext(sc, 10)

    # Connect to Kafka directly (no Receiver), which makes exactly-once processing possible.
    # For details on exactly-once see
    # http://blog.cloudera.com/blog/2015/03/exactly-once-spark-streaming-from-apache-kafka/
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": brokers, "auto.offset.reset": "smallest"})
    kvs.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    return ssc
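The storeOffsetRanges / printOffsetRanges helpers referenced here and in a few earlier snippets are not shown; a minimal sketch of the pattern from the Spark Streaming + Kafka integration guide (the module-level offsetRanges list is an assumption):

offsetRanges = []

def storeOffsetRanges(rdd):
    # Capture the Kafka offset ranges before any shuffle breaks the RDD-to-partition mapping.
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))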
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).

    # Create a list of the permissible positive/negative words for filtering.
    List = pwords + nwords

    # Re-key each word as positive or negative for the RDD.
    counts = tweets.map(lambda line: line.split(" ")).flatMap(lambda line: line) \
        .map(lambda word: (word, 1)).filter(lambda x: x[0] in List) \
        .map(lambda x: ('positive', x[1]) if (x[0] in pwords) else ('negative', x[1]))

    # Add counts; tweets stores the running sum across all time steps.
    tweets = counts.updateStateByKey(updateFunction)
def readSource(ssc, di_in_conf_with_ds_conf, app_conf):
    sourceType = di_in_conf_with_ds_conf['source.type']
    if sourceType == 'kafka':
        kafkaSimpleConsumerApiUsed = app_conf.get('kafka.simple.consumer.api.used', True)
        if kafkaSimpleConsumerApiUsed:
            topics = di_in_conf_with_ds_conf['topics']
            if not isinstance(topics, list):
                raise TypeError("topic should be list")
            brokers = di_in_conf_with_ds_conf['metadata.broker.list']
            kafkaParams = {"metadata.broker.list": brokers}
            stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams).map(lambda x: x[1])
        else:
            zkConnect = di_in_conf_with_ds_conf['zookeeper.connect']
            groupId = app_conf['group.id']
            numReceivers = app_conf.get('num.receivers', 1)
            numConsumerFetchers = app_conf.get('num.consumer.fetchers')
            topics = di_in_conf_with_ds_conf['topics']
            topic_map = dict(zip(topics, numConsumerFetchers))
            # streams = reduce(lambda x, y: x.union(y),
            #                  map(KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map),
            #                      range(0, numReceivers)))
            streams = [KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map)
                       for i in range(0, numReceivers)]
            stream = ssc.union(streams).map(lambda x: x[1])
    elif sourceType == 'hdfs':
        path = di_in_conf_with_ds_conf['fs.defaultFS'] + '/' + di_in_conf_with_ds_conf['path']
        stream = ssc.textFileStream(path)
    else:
        raise Exception('Error: unsupported source.type = ' + sourceType)

    num_repartition = app_conf.get('dataInterface.stream.repatition.partitions')
    if num_repartition is None or not isinstance(num_repartition, int):
        stream2 = stream
    else:
        stream2 = stream.repartition(num_repartition)

    # Optionally format the stream with a user-supplied formatter class.
    format_class_path = di_in_conf_with_ds_conf.get('format.class', '')
    if format_class_path.strip() == '':
        stream3 = stream2
    else:
        format_class_obj = get_class_obj(format_class_path)
        stream3 = format_class_obj.format(stream2)

    return stream3
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, the StreamingContext has been loaded
    # from an existing checkpoint instead.
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    # wordCounts.foreachRDD(echo)
    wordCounts.pprint()

    ssc.checkpoint(checkpointDir)
    return ssc
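createContext above follows the checkpoint-recovery pattern; a minimal sketch of how it is typically wired up with StreamingContext.getOrCreate (broker, topic, and checkpoint path are placeholders):

checkpointDir = "hdfs:///tmp/recoverable-wordcount-checkpoint"  # placeholder path
ssc = StreamingContext.getOrCreate(
    checkpointDir,
    lambda: createContext("localhost:9092", "mytopic", checkpointDir))
ssc.start()
ssc.awaitTermination()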
def functionToCreateContext():
    # Spark context config
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    # Kafka
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)

    # Processing
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .updateStateByKey(updateFunction) \
        .map(toStringList) \
        .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopDataset(conf=conf,
                                                              keyConverter=keyConv,
                                                              valueConverter=valueConv))
    return ssc
def process(rdd): print(">>>> BEGIN CASS") info = getSqlContextInstance(rdd.context).createDataFrame(rdd) info.registerTempTable("info") info.write.format("org.apache.spark.sql.cassandra").\ options(keyspace="record", table="campaign").\ save(mode="append") print(">>>> END CASS") if __name__ == "__main__": sc = SparkContext(appName="test-streaming") ssc = StreamingContext(sc, 5) chance_stream = KafkaUtils.createDirectStream( ssc, [KAFKA_TOPIC_CHANCE], {"metadata.broker.list": KAFKA_NODE}) impression_stream = KafkaUtils.createDirectStream( ssc, [KAFKA_TOPIC_IMPRESSION], {"metadata.broker.list": KAFKA_NODE}) click_stream = KafkaUtils.createDirectStream( ssc, [KAFKA_TOPIC_CLICK], {"metadata.broker.list": KAFKA_NODE}) campaignChance = chance_stream.map(lambda record : record[1].split(" ")).\ map(lambda x: (x[0],1)).\ reduceByKey(lambda x,y : x+y) campaignImpr = impression_stream.map(lambda (k, v): json.loads(v)).\ map(lambda x: (x['compaignId'],x['price'],1)).\ educeByKey(lambda x,y : (x[0]+y[0], x[1]+y[1])).\ map(lambda x : (x[0], x[1]/x[0])) campaignClick = click_stream.map(lambda (k, v): json.loads(v)).\
import os
os.environ['SPARK_HOME'] = '/usr/lib/spark'

from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    conf = (SparkConf().setMaster("local[*]").setAppName("spar_stream")
            .set("spark.executor.memory", "4g").set("spark.driver.memory", "5g"))
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    kafkastream = KafkaUtils.createDirectStream(
        ssc, ['rna'], {"metadata.broker.list": 'localhost:9093'})
    kafkastream.pprint()
    # count.pprint()
    # kafkastream.saveAsTextFile("/user/cloudera/twitter")
    # flatMap(lambda x: x.split('\n')).map(lambda x: x.split(','))

    ssc.start()
    ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def handle_rdd(rdd):
    if not rdd.isEmpty():
        global ss
        df = ss.createDataFrame(rdd, schema=['text', 'words', 'length'])
        df.show()
        # df.write.saveAsTable(name='default.tweets', format='hive', mode='append')


sc = SparkContext(appName="Something")
ssc = StreamingContext(sc, 5)

ss = SparkSession.builder.appName("Something").getOrCreate()
ss.sparkContext.setLogLevel('WARN')

ks = KafkaUtils.createDirectStream(ssc, ['kafkaTwitterSpark'], {'metadata.broker.list': 'localhost:9096'})
lines = ks.map(lambda x: x[1])
transform = lines.map(lambda tweet: (tweet, int(len(tweet.split())), int(len(tweet))))
transform.foreachRDD(handle_rdd)

ssc.start()
ssc.awaitTermination()
stream.map(pair).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).map(
    lambda (k, v): (k, v[0] / v[1])).foreachRDD(send_to_kafka)


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: stream-process.py [topic] [target-topic] [broker-list]")
        exit(1)

    # - create SparkContext and StreamingContext
    sc = SparkContext("local[2]", "StockAveragePrice")
    sc.setLogLevel('INFO')
    ssc = StreamingContext(sc, 5)

    topic, target_topic, brokers = sys.argv[1:]

    # - instantiate a kafka stream for processing
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [topic], {'metadata.broker.list': brokers})
    process_stream(directKafkaStream)

    # - instantiate a simple kafka producer
    kafka_producer = KafkaProducer(bootstrap_servers=brokers)

    # - setup proper shutdown hook
    atexit.register(shutdown_hook, kafka_producer)

    ssc.start()
    ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.streaming.kafka import KafkaUtils
import json

if __name__ == '__main__':
    sc = SparkContext(appName='PythonSparkStreamingKafka')
    sc.setLogLevel("WARN")  # avoid printing logs

    ssc = StreamingContext(sparkContext=sc, batchDuration=2)
    spark_sql = SQLContext(sparkContext=sc)

    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc, topics=['trump'],
                                                kafkaParams={"metadata.broker.list": 'localhost:9092'})

    dfs = kafkaStream.\
        map(lambda dstream: dstream[1])
    dfs.pprint(2)

    ssc.start()
    ssc.awaitTermination()
import sys

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def echo(rdd):
    print rdd


if __name__ == '__main__':
    sc = SparkContext(appName="text")
    ssc = StreamingContext(sc, 5)
    broker = "localhost:2181"
    topic = "model"
    kbrokers = "localhost:9092"
    # kvs = KafkaUtils.createStream(ssc, broker, "3e", {topic: 1}, kafkaParams={"metadata.broker.list": kbrokers})
    # kvs = KafkaUtils.createStream(ssc, "localhost:2181", 'spark', {"model": 1})
    kvs = KafkaUtils.createDirectStream(ssc, ["model"], {"metadata.broker.list": kbrokers})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()
.option("dbtable", OFFSET_TABLE_NAME) \ .option("user", TARGET_DB_USER_NAME) \ .option("password", TARGET_DB_USER_PASSWORD) \ .load() maxOffset = df_read_offsets.agg({'OFFSET': 'max'}).collect()[0][0] if maxOffset == None: maxOffset = 0 topicPartion = TopicAndPartition(TOPIC, PARTITION) fromOffset = {topicPartion: maxOffset} kafkaParams = { "metadata.broker.list": BROKER_LIST, "enable.auto.commit": "false" } directKafkaStream = KafkaUtils.createDirectStream( ssc, [TOPIC], kafkaParams, fromOffsets=fromOffset, keyDecoder=deserializer(), valueDecoder=deserializer()) time.sleep(5) directKafkaStream.foreachRDD(lambda x: save_data(x)) directKafkaStream.transform(store_offset_ranges) \ .foreachRDD(write_offset_ranges) ssc.start() ssc.awaitTermination()
from big_data_kafka.kafka_producer import produce, get_producer


def push_likes_counts_to_kafka(fb_likes):
    producer = get_producer('likes-counts')
    producer.start()
    for fb_like in fb_likes:
        print('FB like:')
        print(fb_like)
        produce(producer, fb_like)
    # produce_messages(producer, fb_likes)
    producer.stop()


if __name__ == "__main__":
    sc = SparkContext(appName='PythonStreamingDirectKafkaWordCount')
    ssc = StreamingContext(sc, 30)

    # noinspection PyDeprecation
    kvs = KafkaUtils.createDirectStream(ssc, ['bdsample'], {'metadata.broker.list': 'localhost:9092'})
    kvs.pprint()
    print('Running KAFKA JOB')

    likes = kvs.map(lambda message: json.loads(message[1]))
    counts = likes.map(lambda fb: (parser.parse(fb['timestamp']).hour, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    counts.foreachRDD(lambda rdd: rdd.foreachPartition(push_likes_counts_to_kafka))

    ssc.start()
    ssc.awaitTermination()
for record in records:
    message = "%s %s %s %s %s %s %s %s" % \
        (record[0][0], record[0][1], record[0][2], record[0][3],
         record[1][0], record[1][1], record[1][2], record[1][3])
    producer.send_messages('best_flights_2008', message.encode())


# MAIN
sc = SparkContext(appName="BestFlights")
sc.setLogLevel('ERROR')

# Create a local StreamingContext
ssc = StreamingContext(sc, 1)
ssc.checkpoint(
    "s3://hsc4-cc-part2-streaming/checkpoints/checkpoint-best-flights")

lines = KafkaUtils.createDirectStream(ssc, ['input_2008'],
                                      {"metadata.broker.list": sys.argv[1], "auto.offset.reset": "smallest"})

# Filter only for data in 2008
lines = lines.map(lambda tup: tup[1])

# Split each line by separator
rows = lines.map(lambda line: line.split())

# Get relevant data
rows = rows.filter(lambda row: len(row) > 8)
airports_fromto = rows.map(lambda row: (
    (row[0], row[1], row[2], AMOrPM(row[5])),
    (row[3], row[4], departureTimePretty(row[5]), float(row[8]))
))

# Filtering just necessary flights
for category in categoriesAll[i]:
    # combine two classes
    if labels.collect()[0] > 0:
        category = category + '_like'
    else:
        category = category + '_dislike'
    Push_to_ES.push('tweets_2', category, json.dumps(data[i]))


kafkaparams = {
    "zookeeper.connect": "localhost:2181",
    "group.id": "my-group",
    "zookeeper.connection.timeout.ms": "10000",
    "metadata.broker.list": "localhost:9092"
}

Stream_feedback_B = KafkaUtils.createDirectStream(ssc, topic_1, kafkaparams)
Stream_feedback_A = KafkaUtils.createDirectStream(ssc, topic_3, kafkaparams)
Stream_rawtweets = KafkaUtils.createDirectStream(ssc, topic_2, kafkaparams)

Stream_feedback_B.pprint()
Stream_feedback_A.pprint()
Stream_rawtweets.pprint()

Stream_feedback_B.foreachRDD(lambda k: process_feedback_B(k))
Stream_feedback_A.foreachRDD(lambda k: process_feedback_A(k))
Stream_rawtweets.foreachRDD(lambda k: process_tweets(k))

ssc.start()
ssc.awaitTermination()
import pyspark_cassandra
from pyspark_cassandra import streaming
import time


def sanitize(line):
    line = line.replace('\\n', ' ')
    line = re.sub(r'\\u.{4}', '', line)
    return line


if __name__ == "__main__":
    sc = SparkContext(appName="TwitterGenerator")
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, 10)

    kvs = KafkaUtils.createDirectStream(
        ssc, ["tweets.topic"], {"metadata.broker.list": "broker:9092"})
    lines = kvs.map(lambda x: x[1])
    counts = lines\
        .flatMap(lambda line: sanitize(line).split(" "))\
        .filter(lambda w: w.startswith('#'))\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda t: (time.time() * 1000, t[0], t[1]))

    counts.pprint()
    counts.saveToCassandra("myks", "test", columns=['timestamp', 'word', 'count'])
conf.set("spark.cassandra.connection.host", "10.240.14.37") conf.set("spark.cassandra.connection.port", "9042") # SparkContext represents the connection to a Spark cluster # Only one SparkContext may be active per JVM sc = SparkContext(conf=conf) # Creating a streaming context with batch interval of 10 sec # As the main point of entry for streaming, StreamingContext handles the streaming application's actions, # including checkpointing and transformations of the RDD. ssc = StreamingContext(sc, 3) # DStream 반환 (RDD로 이루어진 객체) (RDD 스파크 데이터 단위) kafkaStream = KafkaUtils.createDirectStream( ssc, topics=["tweets"], kafkaParams={"bootstrap.servers": "localhost:9092"} #"group.id" -> "spark-streaming-notes", #"auto.offset.reset" -> "earliest" ) #Parse Twitter Data as json json_stream = kafkaStream.map(lambda tweet: json.loads(tweet[1])) parsed = json_stream.map(lambda tweet: tweet_filter(tweet)) parsed.foreachRDD(lambda x: x.saveToCassandra("bts", "tweet_dataset")) #parsed.pprint() #Start Execution of Streams ssc.start() ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def deal_data(rdd):
    data = rdd.collect()
    for d in data:
        print(d)


sc = SparkContext(sparkHome="local", appName="Realtime-Analytics-Engine")
ssc = StreamingContext(sc, batchDuration=int(6))

kafkaParams = {
    "metadata.broker.list": "192.168.32.18:9092,192.168.32.19:9092,192.168.32.20:9092",
    "serializer.class": "kafka.serializer.StringEncoder",
    "auto.offset.reset": "smallest",
    "fetch.message.max.bytes": "22388608"
}

# The topics argument must be a Python list of topic names, and keyDecoder/valueDecoder,
# if supplied, must be Python callables (the default decodes UTF-8 strings), not Java class names.
kvs = KafkaUtils.createDirectStream(ssc, ['senmdt-cache-records'], kafkaParams)
kvs.foreachRDD(lambda rdd: deal_data(rdd))
    .option('kudu.master', kuduMaster)\
    .option('kudu.table', 'impala::sensors.asset_sensors').load()
assets = sqc.read.format('org.apache.kudu.spark.kudu')\
    .option('kudu.master', kuduMaster)\
    .option('kudu.table', 'impala::sensors.well_assets').load()

sensorInfo = sensors.join(assets, ['asset_id'])

# Persist in memory for fast lookup
sensorInfo.persist(StorageLevel.MEMORY_ONLY)
sensorInfo.show()

# Initialize the Spark Streaming Context to pull data from Kafka every 30 seconds
ssc = StreamingContext(sc, 30)
kafkaStream = KafkaUtils.createDirectStream(
    ssc, [kafkaTopic], {"metadata.broker.list": kafkaBroker})
sensorDS = kafkaStream.map(lambda x: x[1])


def process(time, rdd):
    print("========= Time: %s =========" % str(time))
    try:
        rawSensor = spark.read.json(rdd)
        rawSensor = rawSensor.withColumn('sensor_id', rawSensor.sensor_id.cast('integer'))\
            .withColumn('value', rawSensor.value.cast('float'))
        if rawSensor.count() == 0:
            print('No data, sleep until next window')
            return
        print('Raw Sensor Data:')
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.context import SQLContext

if __name__ == "__main__":
    sc = SparkContext("local", "wordcounttest")
    # sc.setLogLevel(logLevel="OFF")
    ssc = StreamingContext(sc, 20)
    # ssc.checkpoint("c:\Playground\spark\logs")
    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, ["events.noflguid"],
                                        {"metadata.broker.list": "ec2-52-203-200-3.compute-1.amazonaws.com:9092"})
    print(str(brokers))
    print(str(topic))
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    print(kvs.count())
    ssc.start()
    ssc.awaitTermination()
    with open(sys.argv[1], 'r') as yml:
        config = yaml.load(yml)
except FileNotFoundError as ex:
    print('ERROR: Config file does not exist: ' + str(ex) + '\n')
    traceback.print_tb(ex.__traceback__)

print(config)

spark = SparkSession \
    .builder \
    .appName(config['app_name']) \
    .getOrCreate()

ssc = StreamingContext(spark.sparkContext, config['time_batch_window'])
sql_context = SQLContext(spark)

kvs = KafkaUtils.createDirectStream(ssc, [config['kafka']['topic']],
                                    {"metadata.broker.list": config['kafka']['brokers']})

data_functions = DataFunction(config)

lines = kvs \
    .map(lambda rdd: rdd[1]) \
    .map(data_functions.load_json_from_string) \
    .filter(lambda x: x != {})

lines.foreachRDD(lambda rdd: data_functions.create_df(spark, sql_context, rdd))

ssc.start()
ssc.awaitTermination()
# Insert data into Kudu
def insert_into_kudu(time, rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = sqc.createDataFrame(rdd, schema)
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu') \
        .option('kudu.master', kudu_master) \
        .option('kudu.table', kudu_table) \
        .mode("append") \
        .save()


if __name__ == "__main__":
    sc = SparkContext(appName="SparkStreaming_IoT")
    ssc = StreamingContext(sc, 5)  # 5 second window

    kvs = KafkaUtils.createDirectStream(
        ssc, [kafka_topic], {"metadata.broker.list": kafka_brokers})

    # parse the kafka message into a tuple
    kafka_stream = kvs.map(lambda x: x[1]) \
        .map(lambda l: json.loads(l)) \
        .map(lambda p: (int(p['sensor_id']), int(p['sensor_ts']),
                        float(p['sensor_0']), float(p['sensor_1']), float(p['sensor_2']),
                        float(p['sensor_3']), float(p['sensor_4']), float(p['sensor_5']),
                        float(p['sensor_6']), float(p['sensor_7']), float(p['sensor_8']),
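insert_into_kudu above, like the Cassandra and model-scoring snippets earlier, calls getSqlContextInstance() without defining it; a minimal sketch of the singleton pattern it usually refers to (this implementation is an assumption based on the Spark Streaming examples):

from pyspark.sql import SQLContext

def getSqlContextInstance(sparkContext):
    # Lazily create one SQLContext per JVM and reuse it across micro-batches.
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']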
    # return to the pool for future reuse
    # ConnectionPool.returnConnection(connection)


# To Run:
# sudo $SPARK_HOME/bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 kafka-spark-test.py
if __name__ == "__main__":
    # To run on cluster:
    # conf = SparkConf().setAppName("Venmo-Graph-Analytics-Dev").setMaster("spark://ip-172-31-0-135:7077")
    # sc = SparkContext(conf=conf)

    # To run locally:
    sc = SparkContext(appName="Venmo-Graph-Analytics-Dev")

    # Set up resources
    ssc = StreamingContext(sc, 1)  # Set Spark Streaming context

    # brokers = "ec2-50-112-19-115.us-west-2.compute.amazonaws.com:9092,ec2-52-33-162-7.us-west-2.compute.amazonaws.com:9092,ec2-52-89-43-209.us-west-2.compute.amazonaws.com:9092"
    brokers = "ec2-52-25-139-222.us-west-2.compute.amazonaws.com:9092"
    topic = 'Venmo-Transactions-Dev'

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, [topic], {"metadata.broker.list": brokers})
    transaction = kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1]))\
        .map(lambda json_body: extract_data(json_body))\
        .foreachRDD(lambda rdd: rdd.foreachPartition(send_partition))
    # transaction.pprint()

    ssc.start()
    ssc.awaitTermination()
    return rdd


def print_offset(rdd):
    for o in offsets:
        print("%s %s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset,
                                  o.untilOffset - o.fromOffset))


config = SparkConf().set("spark.streaming.kafka.maxRatePerPartition", 30000)
scontext = SparkContext(conf=config)
# scontext = SparkContext("local[2]", "kafka_pyspark_test")
stream_context = StreamingContext(scontext, 3)

msg_stream = KafkaUtils.createDirectStream(
    stream_context,
    ['test', ],
    kafkaParams={"metadata.broker.list": "127.0.0.1:9092,"})

'''result = msg_stream.map(lambda x: json.loads(x).keys()).reduce(out_put)
msg_stream.transform(store_offset,).foreachRDD(print_offset)
result.pprint()
'''

targets = msg_stream.map(lambda msg_stream: msg_stream[1])
json_values = []


def write(res):
    json_str = json.dumps(res)
    with open("test_data.json", "a") as json_file:
import csv
from json import loads
from flatten_json import flatten
from time import sleep
# import pandas as pd

print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")

sc = SparkContext()
ssc = StreamingContext(sc, 10)
sqlc = SQLContext(sc)

directKafkaStream = KafkaUtils.createDirectStream(ssc, ["kafkaNBA"],
                                                  {"metadata.broker.list": "localhost:9099"})
lines = directKafkaStream.map(lambda x: x[1])

print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")


def transformer(rdd):
    my_obj = json.loads(rdd)
    return my_obj["player"]["weight_pounds"]


transform = lines.map(transformer)


def build_df(rdd):
    if not rdd.isEmpty():
# -*- coding: UTF-8 -*-
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from setting.default import DefaultConfig
import happybase

# 1. Create the Spark Streaming context
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_ONLINE_CONFIG)
sc = SparkContext(conf=conf)
stream_sc = StreamingContext(sc, 60)

# 2. Kafka read configuration
# If the click-log topic has multiple consumers, give each its own group.id so every group receives the data.
similar_kafka = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER, "group.id": 'similar'}
# 2.1 Consumer for the user click log
SIMILAR_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], similar_kafka)

# 2.2 Kafka configuration for reading hot articles
kafka_params = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER}
HOT_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], kafka_params)

# 2.3 Kafka configuration for reading new articles
click_kafkaParams = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER}
NEW_ARTICLE_DS = KafkaUtils.createDirectStream(stream_sc, ['new-article'], click_kafkaParams)
def enc(data):
    result = {k: helper(v) for k, v in data.items()}
    return result
    # return dict(map(lambda line: line.encode('ascii'), pair.value) for pair in data.items())


conf = SparkConf().setMaster("local[*]").setAppName("StreamingDirectKafka")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

skQuorum = "localhost:2181"
topic = ["meetup"]
kafkaParams = {"metadata.broker.list": "localhost:9092"}
kafkaStream = KafkaUtils.createDirectStream(ssc, topic, kafkaParams)

# stream = ssc.receiverStream(
#     MeetupReceiver("https://stream.meetup.com/2/rsvps")
# )

# data = kafkaStream.map(lambda line: json.loads(line))

rsvp = kafkaStream.map(lambda line: line[1])
rsvp2 = rsvp.map(lambda line: json.loads(line.encode("ascii", "ignore")))
# kafkaStream.pprint()

# process = data.mapValues(lambda line: line.encode('ascii')).cache()
# event = data["topic_name"]
    new_vals0 = 0.0
    new_vals1 = 0
    for val in new_values:
        new_vals0 += val[0]
        new_vals1 += val[1]
    last_vals0 = last_sum[0] if last_sum is not None else 0.0
    last_vals1 = last_sum[1] if last_sum is not None else 0
    return (new_vals0 + last_vals0,
            new_vals1 + last_vals1)


kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'video-group',
    'fetch.message.max.bytes': '15728640',
    'auto.offset.reset': 'largest'})  # Group ID is completely arbitrary

ontime_data = kafkaStream.map(lambda x: x[1]).map(split).flatMap(parse)
filtered = ontime_data.map(lambda fl: ((fl.Origin, fl.Dest), (fl.ArrDelay, 1)))\
    .updateStateByKey(updateFunction)
# filtered.foreachRDD(lambda rdd: print_rdd(rdd))
filtered.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))

ssc.start()
# time.sleep(600)  # Run stream for 10 minutes just in case no detection of producer
# ssc.awaitTermination()
ssc.stop(stopSparkContext=True, stopGraceFully=True)
.master("local[*]")\ .getOrCreate() sc = spark.sparkContext ssc = StreamingContext(sparkContext=sc, batchDuration=1) # the topic to subscribe topic_to_sub = ["test"] # the address of kafka, separate with comma if there are many bootstrap_servers = "localhost:9092" # kafka config info kafka_params = {"metadata.broker.list": bootstrap_servers} # initialize stream to consume data from kafka kafka_stream = KafkaUtils.createDirectStream(ssc=ssc, topics=topic_to_sub, kafkaParams=kafka_params) kafka_stream.pprint() r = redis.Redis("127.0.0.1") def save_redis(rdd): """ save word count result into redis hash. example: word_count green 9 blue 3 red 1
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext
from pyspark import SparkContext
from pyspark.sql import SparkSession
import json
from json import loads

# spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.4 try1.py

sc = SparkContext(appName="samir")
ssc = StreamingContext(sc, 5)

ks = KafkaUtils.createDirectStream(ssc, ['kafkaNBA2'], {'metadata.broker.list': 'localhost:9092'})
result1 = ks.map(lambda x: json.loads(x[1])).flatMap(lambda x: x['data']).map(
    lambda x: x['player'])
result1.pprint()


def handle_rdd(rdd):
    if not rdd.isEmpty():
        global ss
        df = ss.createDataFrame(rdd, schema=[
            'first_name', 'last_name', 'height_inches', 'weight_pounds',
            'team_id', 'height_feet', 'position', 'id'
        ])
        df.show()
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == '__main__':
    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    kstream = KafkaUtils.createDirectStream(ssc, topics=['CodeSubmission'],
                                            kafkaParams={"metadata.broker.list": '52.53.157.26:9092'})
    data = kstream.map(lambda x: x[1].encode("utf-8"))
    data.pprint()

    ssc.start()
    ssc.awaitTerminationOrTimeout(30)
    ssc.stop(stopGraceFully=True)
def getSqlContext(conf):
    input_uri = "mongodb://localhost:27017/Bitcoin.bitcoin"
    output_uri = "mongodb://localhost:27017/Bitcoin.bitcoin"
    conf.set('spark.mongodb.input.uri', input_uri)
    conf.set('spark.mongodb.output.uri', output_uri)
    conf.set('spark.mongodb.input.sampleSize', 50000)
    sc = SparkContext.getOrCreate(conf=conf)
    return SQLContext(sc)


conf = SparkConf().setAppName("BitcoinPrediction").setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)
ssc = StreamingContext(sc, 5)

data = KafkaUtils.createDirectStream(
    ssc, topics=["Bitcoin"], kafkaParams={"metadata.broker.list": "localhost:9092"})

sqlContext = getSqlContext(conf)


# foreachRDD returns None, so the DataFrame has to be built and written inside the
# per-batch callback rather than passed to createDataFrame directly.
def save_batch(rdd):
    if not rdd.isEmpty():
        df = sqlContext.createDataFrame(rdd.map(lambda x: (x[0], list(set(x[1])))))
        df.write.format("mongo").mode("append").save()


data.foreachRDD(save_batch)

ssc.start()
ssc.awaitTermination()
sc = SparkContext(appName="mytstApp") sc.setLogLevel("ERROR") # 减少shell打印日志 ssc = StreamingContext(sc, 30) #tlist = ['Spark_1','Spark_2'] checkpoint_dir = './Checkpoint/spark' ssc.checkpoint(checkpoint_dir) kafka_params = { "bootstrap.servers": "localhost:9092", "group.id": "myUserGroup", "enable.auto.commit": "false", "auto.offset.reset": "largest" } dstream = [KafkaUtils.createDirectStream(ssc, [tlist[i]], kafka_params,\ keyDecoder=spot_decoder,\ valueDecoder=spot_decoder,\ messageHandler=setHandler )\ for i in range(len(tlist)) ] countList = [] for index in range(len(tlist)): print(tlist[index]) tempt = ( dstream[index].map( lambda x : getID(x) )\ .map( lambda x : ( 1, x))\ .updateStateByKey( updatefunction )\ ) print("lalalaall") countList.append(tempt) countList[index].foreachRDD(lambda x: displayID(x))
if __name__ == "__main__": # To run on cluster: # conf = SparkConf().setAppName("Venmo-Graph-Analytics-Test").setMaster("spark://ip-172-31-0-135:7077") # sc = SparkContext(conf=conf) # To run locally: sc = SparkContext(appName="Venmo-Graph-Analytics-Test") # Set up resources ssc = StreamingContext(sc, 1) # Set Spark Streaming context # brokers = "ec2-50-112-19-115.us-west-2.compute.amazonaws.com:9092,ec2-52-33-162-7.us-west-2.compute.amazonaws.com:9092,ec2-52-89-43-209.us-west-2.compute.amazonaws.com:9092" brokers = "ec2-52-25-139-222.us-west-2.compute.amazonaws.com:9092" kafka_stream = KafkaUtils.createDirectStream( ssc, ['Venmo-Transactions-Test'], {"metadata.broker.list": brokers}) transaction = kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1]))\ .map(lambda json_body: extract_data(json_body))\ .foreachRDD(lambda rdd: rdd.foreachPartition(send_partition)) # transaction.pprint() # lines = kafka_stream.map(lambda x: x[1]) # counts = lines.flatMap(lambda line: line.split(" ")) \ # .map(lambda word: (word, 1)) \ # .reduceByKey(lambda a, b: a+b) # counts.pprint() ssc.start() ssc.awaitTermination()
    # tf-idf calculation
    tf = HashingTF(numFeatures=numFeatures).transform(
        prep_filtered.map(porter_stem, preservesPartitioning=True))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)

    # build the training dataset with labels
    training = review_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

    # train the Naive Bayes classifier model
    model = NaiveBayes.train(training)
    model_name = "naivebayes" + str(counter_model)

    # save the classifier model to HDFS
    output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
    model.save(sc, output_dir)
    counter_model.add(1)

    end = time.time()
    print("Model Name : ", model_name, ", Total Reviews : ", reviews.count(),
          "Processing Time : ", (end - start))


# create the stream connection to Kafka
kvs = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
parsed = kvs.map(lambda v: json.loads(v[1]))
reviews = parsed.map(lambda r: [r['overall'], r['reviewText']])
reviews.foreachRDD(process)

ssc.start()
ssc.awaitTermination()