from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.listener import StreamingListener


class MyStreamingListener(StreamingListener):
    """
    Uses the py4j framework to send Java objects to the PySpark process.
    The parameters passed to the callbacks are Java objects whose member
    variables are themselves objects; they are not sent as primitive data types.
    """
    def onBatchStarted(self, batchStarted):
        # 'batchStarted' is an instance of org.apache.spark.streaming.api.java.JavaStreamingListenerBatchStarted
        print('>>> Batch started...number of records: ', batchStarted.batchInfo().numRecords())

    def onBatchCompleted(self, batchCompleted):
        # 'batchCompleted' is an instance of org.apache.spark.streaming.api.java.JavaStreamingListenerBatchCompleted
        print('>>> Batch completed...time taken (ms) = ', batchCompleted.batchInfo().totalDelay())
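
    # Hedged addition (not in the original snippet): receiver-level callbacks follow
    # the same pattern. The getter names below assume Spark's JavaReceiverInfo
    # wrapper and may differ across Spark versions.
    def onReceiverError(self, receiverError):
        # 'receiverError' wraps org.apache.spark.streaming.api.java.JavaStreamingListenerReceiverError
        info = receiverError.receiverInfo()
        print('>>> Receiver error on ', info.name(), ': ', info.lastErrorMessage())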
        
if __name__ == '__main__':
    ssc = StreamingContext(
        SparkContext(conf=SparkConf().setAppName('TestStreamingListenerJob')),
        5)
 
    ssc.addStreamingListener(MyStreamingListener())
    
    ssc\
         .socketTextStream('localhost', 9999)\
         .flatMap(lambda line: line.split(' '))\
         .count()\
         .pprint()

    ssc.start()
    ssc.awaitTermination()
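

# Hedged usage sketch (not part of the original snippet): something must be
# listening on localhost:9999 for socketTextStream to connect to, e.g. `nc -lk 9999`.
# The tiny feeder below is an illustrative Python equivalent; its names are hypothetical.
import socket
import time

def feed_test_lines(port=9999, lines=('hello spark streaming', 'hello listener')):
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind(('localhost', port))
    server.listen(1)
    conn, _ = server.accept()  # the streaming receiver connects here
    for line in lines:
        conn.sendall((line + '\n').encode('utf-8'))
        time.sleep(1)
    conn.close()
    server.close()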

class twitterStreamingProcess:
    # __init__ (which loads self.config) is omitted from this snippet
    def start(self):
        # optimize spark-streaming performance
        conf = SparkConf()
        conf.set("spark.locality.wait", 10)
        conf.set("spark.streaming.backpressure.enabled", True)
        conf.set("spark.streaming.kafka.consumer.poll.ms", 512)
        conf.set("spark.streaming.receiver.maxRate", 1000)
        sc = SparkContext(conf=conf, appName = "spark_streaming_kafka")
        
        sc.setLogLevel("WARN")
        # batch interval in seconds (cast in case the config stores it as a string)
        ssc = StreamingContext(
            sc, int(self.config["TWITTER_STREAMING"]["MINI_BATCH_TIME_INTERVAL_SEC"]))
        # Listener: a StreamingListener subclass defined elsewhere in this project
        listener = Listener(sc)
        ssc.addStreamingListener(listener)

        self.num_subreddit = len(get_tweet_count_dict(sc))
        print("classify to %d" % \
				self.num_subreddit)

        # union of streams
        numStreams = 8
        kafkaStreams = [KafkaUtils.createStream(
            ssc,
            self.config["DEFAULT"]["KAFKA_PUBLIC_IP"] + ':2181',
            'spark-streaming',
            {'twitter': 1}) for _ in range(numStreams)]
        unifiedStream = ssc.union(*kafkaStreams)

        # alternative: a direct (receiver-less) stream
        #kafkaStream = KafkaUtils.createDirectStream(ssc, ['twitter'], {"metadata.broker.list": self.config["DEFAULT"]["KAFKA_BROKER_LIST"]})

        # load streaming messages from Kafka (the value of each record is a JSON string)
        parsed = unifiedStream.map(lambda v: json.loads(v[1]))
        
        # debug usage
        #parsed.count().map(lambda x:'Tweets in this batch: %s' % x).pprint()

        # process and classify tweets: map each tweet to (user name, set of words in the text)
        # earlier variant: parsed.map(lambda tweet: self.get_word_set(tweet['text'], tweet['user']['name']))
        subreddit_topic = parsed.map(
            lambda tweet: (tweet['user']['name'], self.get_word_set(tweet['text'])))
        
        subreddit_topic = subreddit_topic.map(self.get_top_topic)
        subreddit_topic.pprint()
		
        ssc.start()
        ssc.awaitTermination()
        return

def main():
    process = twitterStreamingProcess()
    process.start()

if __name__ == '__main__':
    main()
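

# Hedged sketch (not part of the original snippet): the Listener class passed to
# addStreamingListener above is defined elsewhere in this project. A minimal
# stand-in that only reports per-batch timing could look like the class below;
# every name in it is an illustrative assumption, not the project's actual code.
from pyspark.streaming.listener import StreamingListener

class Listener(StreamingListener):
    def __init__(self, sc):
        self.sc = sc  # kept only because the snippet constructs Listener(sc)

    def onBatchCompleted(self, batchCompleted):
        info = batchCompleted.batchInfo()
        print("batch completed: %d records, total delay %s ms"
              % (info.numRecords(), info.totalDelay()))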
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.listener import StreamingListener

#sc = StreamingContext()

sc = SparkContext(appName="PythonTwitterStreaming")
ssc = StreamingContext(sc, 1)

def batchStarted(batchStartedEvent):
    print('batch started')
    print(batchStartedEvent)

# the base StreamingListener callbacks are no-ops; override onBatchStarted on this instance
listener = StreamingListener()
listener.onBatchStarted = batchStarted

ssc.addStreamingListener(listener)

# ssc.addStreamingListener()
#lines = ssc.socketTextStream("54.213.33.240", 9002)

stream = ssc.socketTextStream("54.213.33.240", 9002)
#stream = ssc.addStreamingListener()

stream.pprint()

#print(lines)

ssc.start()
ssc.awaitTermination()

# The top of this snippet was truncated; only the callback body below survived.
# Its imports and the DebugStreamingListener class it belongs to (registered
# further down) are reconstructed minimally here, with assumed names.
from pprint import pprint

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.listener import StreamingListener


class DebugStreamingListener(StreamingListener):
    def onOutputOperationCompleted(self, outputOperationCompleted):
        pprint(outputOperationCompleted)


# conf = (SparkConf()
#          .setMaster("spark://192.168.1.33:7077")
#          .setAppName("PythonStreamingExample"))

# sc = SparkContext(conf = conf)

sc = SparkContext(appName="PythonStreamingNetworkWordCount2")
ssc = StreamingContext(sc,
                       5)  # second argument is the batch interval in seconds.

# 9999 1MB text file
# 9998 tiny text file
# 9997 nc with stdin

# IP address that worker node will connect to (don't use localhost or 127.0.0.1)
lines = ssc.socketTextStream('192.168.1.33', 9999)
# lines = ssc.socketTextStream('localhost', 9999)

lines.flatMap(lambda line: line.split(" "))\
              .map(lambda word: (word, 1))\
              .reduceByKey(lambda a, b: a+b).pprint()

streamingListener = DebugStreamingListener()

ssc.addStreamingListener(streamingListener=streamingListener)

ssc.start()
ssc.awaitTermination()
                'place': tweet['place'],
                'hash_tags': hashtag['text']
            }
            res_list.append(temp)
    print(res_list)
    if res_list:
        insert_into_hashtags(res_list)


if __name__ == "__main__":

    # creating spark configuration
    conf = SparkConf()
    conf.setAppName(SparkStream.APP_NAME.value)
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, int(SparkStream.STREAM_INTERVAL.value))
    ssc.checkpoint(SparkStream.CHECKPOINT.value)

    dataStream = ssc.socketTextStream(SparkStream.TCP_IP.value,
                                      int(SparkStream.TCP_PORT.value))
    cv = dataStream.map(lambda x: json.loads(x))
    cv.foreachRDD(process_rdd)

    # add listener to check if stream closed
    ssc.addStreamingListener(CustomListener())
    ssc.start()
    ssc.awaitTermination()
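

# Hedged sketch (not part of the original snippet): CustomListener is defined
# elsewhere in this project. Given the comment above ("check if stream closed"),
# a minimal stand-in could watch the receiver callbacks as below; all of it is
# an illustrative assumption, not the project's actual class.
from pyspark.streaming.listener import StreamingListener

class CustomListener(StreamingListener):
    def onReceiverStopped(self, receiverStopped):
        print("receiver stopped:", receiverStopped.receiverInfo())

    def onReceiverError(self, receiverError):
        print("receiver error:", receiverError.receiverInfo())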
    #output_file.write("KMeans Model Update, %d, %s, %.5f\n"%(count,NUMBER_PARTITIONS, end_train-start))
    output_file.flush()


appName = "PythonSparkStreamingKafkaKMeans"
conf = SparkConf().setAppName(appName).set(
    'spark.metrics.conf.*.sink.csv.class',
    'org.apache.spark.metrics.sink.CsvSink').set(
        'spark.metrics.conf.*.sink.csv.directory', './')
sc = SparkContext(conf=conf)

ssc_start = time.time()
ssc = StreamingContext(sc, STREAMING_WINDOW)

batch_collector = BatchInfoCollector()
ssc.addStreamingListener(batch_collector)

kafka_dstream = KafkaUtils.createDirectStream(
    ssc, [TOPIC], {"metadata.broker.list": METABROKER_LIST})
ssc_end = time.time()
#output_file.write("Spark SSC Startup, %d, %s\n"%( NUMBER_PARTITIONS, str(ssc_end-ssc_start)))

kafka_dstream.count().pprint()

points = kafka_dstream.transform(pre_process)
points.pprint()
points.foreachRDD(model_update)

try:
    ssc.start()
    ssc.awaitTermination()
except KeyboardInterrupt:
    # handler assumed; the original snippet's except clause was cut off here
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
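

# Hedged sketch (not part of the original snippet): BatchInfoCollector is
# defined elsewhere. Given its name and how it is registered above, a minimal
# stand-in that accumulates per-batch statistics could look like this; every
# name below is an illustrative assumption, not the project's actual class.
from pyspark.streaming.listener import StreamingListener

class BatchInfoCollector(StreamingListener):
    def __init__(self):
        self.batch_infos = []

    def onBatchCompleted(self, batchCompleted):
        info = batchCompleted.batchInfo()
        self.batch_infos.append((info.numRecords(), info.totalDelay()))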
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.listener import StreamingListener

#sc = StreamingContext()

sc = SparkContext(appName="PythonTwitterStreaming")
ssc = StreamingContext(sc, 1)

sl = StreamingListener()
#sl.onBatchStarted()  # callbacks are invoked by Spark itself, not called directly

ssc.addStreamingListener(sl)
#lines = ssc.socketTextStream("54.213.33.240", 9002)

stream = ssc.socketTextStream("54.213.33.240", 9002)
#stream = ssc.addStreamingListener()

stream.pprint()

#print(lines)

ssc.start()
ssc.awaitTermination()

#ssc = new StreamingContext("local[2]", "NodejsTcpClient", Seconds(1))

#val lines = ssc.socketTextStream("127.0.0.1", 1337, StorageLevel.MEMORY_AND_DISK_SER)
    #output_file.write("KMeans Prediction, %.3f\n"%(end_pred-end_train))
    #return predictions
    

def model_prediction(rdd):
    pass


##########################################################################################################################
# Start Streaming App
    
ssc_start = time.time()    
ssc = StreamingContext(sc, STREAMING_WINDOW)

batch_collector = BatchInfoCollector()
ssc.addStreamingListener(batch_collector)
      

#kafka_dstream = KafkaUtils.createStream(ssc, KAFKA_ZK, "spark-streaming-consumer", {TOPIC: 1})
#kafka_param: "metadata.broker.list": brokers
#              "auto.offset.reset" : "smallest" # start from beginning
kafka_dstream = KafkaUtils.createDirectStream(ssc, [TOPIC], {"metadata.broker.list": METABROKER_LIST,
                                                             "auto.offset.reset" : "smallest"}) #, fromOffsets=fromOffset)
ssc_end = time.time()    
output_file.write("Spark SSC Startup, %d, %d, %s, %.5f\n"%(spark_cores, -1, NUMBER_PARTITIONS, ssc_end-ssc_start))


#####################################################################
# Scenario Count

#global counts