Example #1
def stream(ssc):

    zkQuorum = "localhost:2181"
    topic = "topic1"
    # Receiver-based stream via ZooKeeper
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    # Receiver-less direct stream reading straight from the brokers (unused alternative)
    # kstream = KafkaUtils.createDirectStream(ssc, topics=['topic1'], kafkaParams={"metadata.broker.list": "localhost:9092"})

    tweets = tweets.map(lambda x: x[1].encode("ascii","ignore"))
    return tweets
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time

	WARNING!! This function only works for spark 1.4.0+ 

	Args:
		brokers: the kafka broker that we look at for the topic
		topic: the kafka topic for input
		bucket_interval: the time interval in seconds (int) that the job will
			bucket
		output_msg: callable taking (sc, ssc) and returning the function passed
			to foreachRDD with each interval's counts
		message_parse: callable that maps a raw message to its bucket key
		valueDecoder: optional decoder applied to each Kafka message value

	Returns:
		None
		
	"""
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers}, valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)

    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
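The output_msg and message_parse callbacks are not shown here. A minimal sketch of what this helper expects, assuming simple comma-separated messages (both function bodies below are illustrative, not from the original source):

def message_parse(line):
    # assumed format: the bucket key is the first comma-separated field
    return line.split(",")[0]

def output_msg(sc, ssc):
    # returns the function handed to foreachRDD with each interval's counts
    def printer(time, rdd):
        print("interval %s: %s" % (time, rdd.collect()))
    return printer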
    def get_kafka_stream(topic, streaming_context):
        offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
        app_name = streaming_context.sparkContext.appName
        saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
        if len(saved_offset_spec) < 1:

            MonMetricsKafkaProcessor.log_debug(
                "No saved offsets available..."
                "connecting to kafka without specifying offsets")
            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers})

            return kvs

        else:
            from_offsets = {}
            for key, value in saved_offset_spec.items():
                if key.startswith("%s_%s" % (app_name, topic)):
                    # spec_app_name = value.get_app_name()
                    spec_topic = value.get_topic()
                    spec_partition = int(value.get_partition())
                    # spec_from_offset = value.get_from_offset()
                    spec_until_offset = value.get_until_offset()
                    # composite_key = "%s_%s_%s" % (spec_app_name,
                    #                               spec_topic,
                    #                               spec_partition)
                    # partition = saved_offset_spec[composite_key]
                    from_offsets[
                        TopicAndPartition(spec_topic, spec_partition)
                    ] = long(spec_until_offset)

            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream :"
                " topic:{%s} : start " % topic)
            for key, value in from_offsets.items():
                MonMetricsKafkaProcessor.log_debug(
                    "get_kafka_stream: calling createDirectStream : "
                    "offsets : TopicAndPartition:{%s,%s}, value:{%s}" %
                    (str(key._topic), str(key._partition), str(value)))
            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream : "
                "topic:{%s} : done" % topic)

            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers},
                from_offsets)
            return kvs
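get_kafka_stream only restores offsets; the saving side is not shown. A sketch of how offsets could be persisted after each batch, assuming a hypothetical save method on the same offset repository (only rdd.offsetRanges() is standard direct-stream API):

def save_kafka_offsets(rdd, app_name):
    offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
    for o in rdd.offsetRanges():
        # hypothetical persistence call; the real repository interface may differ
        offset_specifications.save_offset(app_name, o.topic, o.partition,
                                          o.fromOffset, o.untilOffset)

A driver would typically call this from foreachRDD on the direct stream once a batch has been processed.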
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))
    words = tweets.flatMap(lambda line: line.split(" "))
    tweets = words.filter(lambda x: x in pwords or x in nwords)
    tweets = tweets.map(lambda x: ("positive",1) if x in pwords else ("negative",1))
    tweets = tweets.reduceByKey(lambda x,y: x+y)
    tweets = tweets.updateStateByKey(updateFunction)
    tweets.pprint()

    pds = words.filter(lambda x: x in pwords)
    nds = words.filter(lambda x: x in nwords)

    plist=[]
    nlist=[]

    pds.foreachRDD(lambda t,rdd: plist.append(rdd.count()))    
    nds.foreachRDD(lambda t,rdd: nlist.append(rdd.count()))

    counts = []
  
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    for i in range(0,len(plist)):
        counts.append((plist[i],nlist[i]))

    return counts
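The updateFunction passed to updateStateByKey is not shown in this snippet; the definition used in a later example on this page is:

def updateFunction(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)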
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))
    #tweets.pprint()
    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
    words = tweets.flatMap(lambda line: line.split(" "))
    pairs = words.map(classifier).map(lambda word: (word, 1)).filter(lambda x: x[0] != 'none').reduceByKey(lambda a,b: a+b)
    runningCounts = pairs.updateStateByKey(updateFunction)
    runningCounts.pprint()
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    pairs.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    #print counts
    return counts
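The classifier helper referenced above is not included; a sketch of what it likely does, assuming pwords and nwords are visible at module level in the original script:

def classifier(word):
    # pwords/nwords are assumed to be module-level lists of sentiment words
    if word in pwords:
        return 'positive'
    if word in nwords:
        return 'negative'
    return 'none'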
Example #6
0
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)

    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)
    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))


    # Print the first ten elements of each RDD generated in this DStream to the console
    #tweets.pprint()
    words = tweets.flatMap(lambda line: line.split(" "))

    posNegPairs = words.map(lambda word: myMapping(word, pwords, nwords))
    filteredPairs = posNegPairs.filter(lambda x: x[0] != "na")
    posNegCounts = filteredPairs.reduceByKey(lambda x, y: x + y)


    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).

    cumulativeCounts = posNegCounts.updateStateByKey(myRunningUpdate)
    cumulativeCounts.pprint()    
    
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    posNegCounts.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    return counts
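myMapping and myRunningUpdate are not part of this excerpt; plausible definitions, matching how the pairs are filtered on "na" and accumulated above:

def myMapping(word, pwords, nwords):
    if word in pwords:
        return ("positive", 1)
    if word in nwords:
        return ("negative", 1)
    return ("na", 1)

def myRunningUpdate(newValues, runningCount):
    return sum(newValues, runningCount or 0)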
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})

    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    pword_rdd=tweets.flatMap(lambda line: line.split(" ")).map(lambda word: ("positive",1) if word in pwords else ("positive",0)).reduceByKey(lambda a,b:a+b)
    nword_rdd=tweets.flatMap(lambda line: line.split(" ")).map(lambda word: ("negative",1) if word in nwords else ("negative",0)).reduceByKey(lambda a,b:a+b)

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # make the plot on this rdd -combined_rdd

    combined_rdd=pword_rdd.union(nword_rdd)
    running_counts=combined_rdd.updateStateByKey(updateFunction)
    
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]

    counts = []
    combined_rdd.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    # print "printing dstream"
    running_counts.pprint()
		
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
Example #9
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message value

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)  # prints the DStream object itself; the actual counts are shown by pprint() below

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # Show the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
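storeOffsetRanges and printOffsetRanges are not defined in this snippet; a sketch following the standard direct-stream offset pattern (the module-level list is an assumption of this sketch):

offsetRanges = []

def storeOffsetRanges(rdd):
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))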
Example #10
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list":'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    
    # Each element of tweets will be the text of a tweet.
    # Need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    pnTweets = tweets.flatMap(lambda line: line.split(" "))
    pnTweetsPairs = pnTweets.map(lambda x: determine(x,pwords,nwords))
    wordCounts = pnTweetsPairs.reduceByKey(lambda x, y: x + y)
    
    totalCounts = pnTweetsPairs.updateStateByKey(updateFunction)
    totalCounts.pprint()
    # Let the counts variable hold the word counts for all time steps
    # Need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    wordCounts.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    # Filter out entries that are neither positive nor negative
    
    newCounts = []
    for count in counts:
        newCount = [item for item in count if item[0] == "positive" or item[0] == "negative"]
        newCounts.append(newCount)
    
    return newCounts
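determine is not shown; a sketch consistent with the post-filtering above, which drops words that are neither positive nor negative (the "neither" key is an assumption):

def determine(word, pwords, nwords):
    if word in pwords:
        return ("positive", 1)
    if word in nwords:
        return ("negative", 1)
    return ("neither", 1)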
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
   
    words=tweets.flatMap(lambda x: x.split(" ")).filter(lambda x: x in pwords or x in nwords) 
    wordPairs=words.map(lambda x: ("positive",1) if x in pwords else ("negative",1))

    wordCount=wordPairs.reduceByKey(lambda x, y: x + y)
    
    runningCounts = wordPairs.updateStateByKey(updateFunction)

    runningCounts.pprint()
    

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    wordCount.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
Example #12
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
    words = tweets.flatMap(lambda line: line.split(' ')) \
            .map(lambda word: ('positive', 1) if word in pwords else ('negative', 1) if word in nwords else ('none', 1)) \
            .filter(lambda x: x[0]=='positive' or x[0]=='negative') \
            .reduceByKey(lambda x, y: x + y)
    # Print the first ten elements of each RDD generated in this DStream to the console
    def updateValues(values, count):
        if count is None:
            count = 0
        return sum(values, count)

    updatedWords = words.updateStateByKey(updateValues)
    updatedWords.pprint()
    
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    # YOURDSTREAMOBJECT.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    words.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))

    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
Example #13
    def test_kafka_direct_stream_transform_get_offsetRanges(self):
        """Test the Python direct Kafka stream transform get offsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                       "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

        offsetRanges = []

        def transformWithOffsetRanges(rdd):
            for o in rdd.offsetRanges():
                offsetRanges.append(o)
            return rdd

        # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together,
        # only the TransformedDstreams can be folded together.
        stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()
        self.ssc.start()
        self.wait_for(offsetRanges, 1)

        self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message value

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # Show the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #15
def main():
    sc = SparkContext(appName="IntrusionDetector")
    ssc = StreamingContext(sc, batch_durations)

    kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker})
    kvs.foreachRDD(processRDD)
    ssc.start()
    ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    

    #tweets.pprint()
    words = tweets.flatMap(lambda tweet:tweet.split(" "))
    #words.pprint()

    positive = words.filter(lambda x: (x in pwords))
    negative = words.filter(lambda x: (x in nwords))

    #positive.pprint()
    #negative.pprint()

    ppairs = positive.map(lambda p: ('positive', 1))
    npairs = negative.map(lambda n: ('negative', 1))

    pwordCounts = ppairs.reduceByKey(lambda x, y: x + y)
    nwordCounts = npairs.reduceByKey(lambda x, y: x + y)

    count = pwordCounts.union(nwordCounts)
    #count.pprint()
    #pwordCounts.pprint()
    #nwordCounts.pprint()

    def updateFunction(newValues, runningCount):
        if runningCount is None:
           runningCount = 0
        return sum(newValues, runningCount)

    prunningCounts = pwordCounts.updateStateByKey(updateFunction)
    nrunningCounts = nwordCounts.updateStateByKey(updateFunction)

    #prunningCounts.pprint()
    #nrunningCounts.pprint()

    total = prunningCounts.union(nrunningCounts)
    total.pprint()

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    count.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))

    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, interval_seconds)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
def read_tweets():

    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # test batch interval of 600 seconds
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
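A factory like this is normally wired up with StreamingContext.getOrCreate, so the job recovers from the checkpoint directory when one exists; a minimal driver sketch:

if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate("checkpoint", functionToCreateContext)
    ssc.start()
    ssc.awaitTermination()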
    def test_kafka_direct_stream(self):
        """Test the Python direct Kafka stream API."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(), "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
        self._validateStreamResult(sendData, stream)
Example #21
    def test_kafka_direct_stream_from_offset(self):
        """Test the Python direct Kafka stream API with start offset specified."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        fromOffsets = {TopicAndPartition(topic, 0): long(0)}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
        self._validateStreamResult(sendData, stream)
def stream_kafka():
    global ssc

    kstream = KafkaUtils.createDirectStream(ssc, topics=['2008'], kafkaParams={
        "metadata.broker.list": 'ip-172-31-12-78.us-west-1.compute.internal:6667'})

    contents = kstream.flatMap(get_airport_carrier_delay).reduceByKey(
        lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average)

    ssc.start()
    ssc.awaitTerminationOrTimeout(15000)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
def stream_kafka():
    global ssc

    kstream = KafkaUtils.createDirectStream(ssc, topics=['2008'], kafkaParams={
        "metadata.broker.list": 'ip-172-31-12-78.us-west-1.compute.internal:6667'})
    # kstream = KafkaUtils.createStream(ssc, "localhost:2181", "raw-event-streaming-consumer", {"2008":1})
    contents = kstream.flatMap(get_airports).updateStateByKey(update_count)
    contents.foreachRDD(lambda rdd: top_airports(rdd))

    ssc.start()
    ssc.awaitTerminationOrTimeout(15000)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store', dest='enriched_data_path', help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path', help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store', dest='kafka_zookeeper_hosts', help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store', dest='kafka_broker_list', help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store', dest='kafka_message_topic', help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store', dest='kafka_alert_topic', help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store', dest='kafka_enriched_data_topic', help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float', default=15.0,
        action='store', dest='streaming_batch_duration_sec', help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
        action='store', dest='max_batches', help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: 
        process_messages(time, rdd,
            ssc=ssc,
            model=model,
            enriched_data_path=options.enriched_data_path,
            zookeeper_hosts=options.kafka_zookeeper_hosts,
            kafka_alert_topic=options.kafka_alert_topic,
            kafka_enriched_data_topic=options.kafka_enriched_data_topic,
            max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
def main():
    brokers = 'localhost:9092'
    topic = 'openbmp.parsed.unicast_prefix'
    sc = SparkContext(appName='BGPPrefixOriginValidation')
    ssc = StreamingContext(sc,2)
 
    directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {'metadata.broker.list':brokers})
    #directKafkaStream.pprint()

    lines = directKafkaStream.flatMap(lambda x: x[1].splitlines()).filter(lambda line: line.startswith('add'))
    structured_rdd = lines.map(structure_data)
 
    structured_rdd.foreachRDD(lambda rdd: rdd.foreachPartition(validate_bgp_prefix)) 
    
    ssc.start()
    ssc.awaitTermination()
def createContext(brokers, topic):
    print("Create new context")
    conf = SparkConf().set("spark.default.parallelism", "2").set("spark.streaming.kafka.maxRatePerPartition", 1000)
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount", conf=conf)
    ssc = StreamingContext(sc, 10)
    # Connect to Kafka directly (no receiver), which makes exactly-once processing possible
    # For details on exactly-once, see http://blog.cloudera.com/blog/2015/03/exactly-once-spark-streaming-from-apache-kafka/
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers, "auto.offset.reset": "smallest"})
    kvs.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    return ssc
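A possible driver for this factory (argument handling is assumed, not from the source); the storeOffsetRanges/printOffsetRanges helpers it references follow the same pattern sketched after Example #9:

if __name__ == "__main__":
    brokers, topic = sys.argv[1:]
    ssc = createContext(brokers, topic)
    ssc.start()
    ssc.awaitTermination()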
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))
    
    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    
    # Create a list of permissible positive/negative words for filtering
    List = pwords + nwords
    # Re-key each matching word as positive or negative for the RDD
    counts = tweets.map(lambda line: line.split(" ")).flatMap(lambda line: line) \
        .map(lambda word: (word, 1)).filter(lambda x : x[0] in List).map(lambda x: ('positive', x[1]) if (x[0] in pwords) else ('negative', x[1]))
    # Accumulate counts; tweets stores the running sum across all time steps
    tweets = counts.updateStateByKey(updateFunction)
    def readSource(ssc, di_in_conf_with_ds_conf, app_conf):
        sourceType = di_in_conf_with_ds_conf['source.type']

        if sourceType == 'kafka':
            kafkaSimpleConsumerApiUsed = app_conf.get('kafka.simple.consumer.api.used', True)
            if kafkaSimpleConsumerApiUsed:
                topics = di_in_conf_with_ds_conf['topics']
                if not isinstance(topics, list):
                    raise TypeError("topic should be list")

                brokers = di_in_conf_with_ds_conf['metadata.broker.list']
                kafkaParams = {"metadata.broker.list": brokers}
                stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams).map(lambda x: x[1])
            else:
                zkConnect = di_in_conf_with_ds_conf['zookeeper.connect']
                groupId = app_conf['group.id']
                numReceivers = app_conf.get('num.receivers', 1)
                numConsumerFetchers = app_conf.get('num.consumer.fetchers')
                topics = di_in_conf_with_ds_conf['topics']
                topic_map = dict(zip(topics, numConsumerFetchers))
                # streams = reduce(lambda x, y: x.union(y),
                #                  map(KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map),
                #                      range(0, numReceivers)))
                streams = [KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map) for i in range(0, numReceivers)]
                stream = ssc.union(streams).map(lambda x: x[1])
        elif sourceType == 'hdfs':
            path = di_in_conf_with_ds_conf['fs.defaultFS'] + '/' + di_in_conf_with_ds_conf['path']
            stream = ssc.textFileStream(path)
        else:
            raise Exception('Error: unsupported source.type = ' + sourceType)

        num_repartition = app_conf.get('dataInterface.stream.repatition.partitions')
        if num_repartition is None or not isinstance(num_repartition, int):
            stream2 = stream
        else:
            stream2 = stream.repartition(num_repartition)

        # Whether to format the stream with a formatting plugin class
        format_class_path = di_in_conf_with_ds_conf.get('format.class', '')
        if format_class_path.strip() == '':
            stream3 = stream2
        else:
            format_class_obj = get_class_obj(format_class_path)
            stream3 = format_class_obj.format(stream2)

        return stream3
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint

    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)

    #wordCounts.foreachRDD(echo)
    wordCounts.pprint()
    ssc.checkpoint(checkpointDir)
    return ssc
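Because this factory takes arguments, the usual recovery pattern closes over them so StreamingContext.getOrCreate can rebuild the context on a cold start (brokers, topic, and checkpointDir are assumed to come from the command line):

ssc = StreamingContext.getOrCreate(checkpointDir,
                                   lambda: createContext(brokers, topic, checkpointDir))
ssc.start()
ssc.awaitTermination()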
def functionToCreateContext():
  # spark context config
  sc = SparkContext(appName="StreamingExampleWithKafka")
  ssc = StreamingContext(sc, 10)
  ssc.checkpoint("checkpoint")
  
  # kafka
  opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
  kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
  # processing
  lines = kvs.map(lambda x: x[1])
  counts = lines.flatMap(lambda line: line.split(" ")) \
   .map(lambda word: (word, 1)) \
   .updateStateByKey(updateFunction) \
   .map(toStringList) \
   .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv))
  return ssc
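The conf, keyConv, valueConv, and toStringList used above are defined elsewhere in the original script; a sketch modeled on Spark's hbase_outputformat example (host, table, and column names are placeholders):

host = "node1.example.com"
table = "wordcounts"
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
conf = {"hbase.zookeeper.quorum": host,
        "hbase.mapred.outputtable": table,
        "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
        "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}

def toStringList(kv):
    # (word, count) -> (rowkey, [rowkey, column family, qualifier, value]) for the Put converter
    word, count = kv
    return (word, [word, "cf", "count", str(count)])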
Example #31
def process(rdd):
    print(">>>> BEGIN CASS")
    info = getSqlContextInstance(rdd.context).createDataFrame(rdd)
    info.registerTempTable("info")
    info.write.format("org.apache.spark.sql.cassandra").\
             options(keyspace="record", table="campaign").\
             save(mode="append")

    print(">>>> END CASS")


if __name__ == "__main__":
    sc = SparkContext(appName="test-streaming")
    ssc = StreamingContext(sc, 5)

    chance_stream = KafkaUtils.createDirectStream(
        ssc, [KAFKA_TOPIC_CHANCE], {"metadata.broker.list": KAFKA_NODE})
    impression_stream = KafkaUtils.createDirectStream(
        ssc, [KAFKA_TOPIC_IMPRESSION], {"metadata.broker.list": KAFKA_NODE})
    click_stream = KafkaUtils.createDirectStream(
        ssc, [KAFKA_TOPIC_CLICK], {"metadata.broker.list": KAFKA_NODE})



    campaignChance = chance_stream.map(lambda record : record[1].split(" ")).\
         map(lambda x: (x[0],1)).\
         reduceByKey(lambda x,y : x+y)
    campaignImpr = impression_stream.map(lambda (k, v): json.loads(v)).\
         map(lambda x: (x['compaignId'], (x['price'], 1))).\
         reduceByKey(lambda x,y : (x[0]+y[0], x[1]+y[1])).\
         map(lambda x : (x[0], x[1][0]/x[1][1]))
    campaignClick = click_stream.map(lambda (k, v): json.loads(v)).\
import os
os.environ['SPARK_HOME'] = '/usr/lib/spark'

from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    conf = (SparkConf().setMaster("local[*]").setAppName("spar_stream").set(
        "spark.executor.memory", "4g").set("spark.driver.memory", "5g"))
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)
    kafkastream = KafkaUtils.createDirectStream(
        ssc, ['rna'], {"metadata.broker.list": 'localhost:9093'})
    kafkastream.pprint()
    ssc.start()
    ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def handle_rdd(rdd):
    if not rdd.isEmpty():
        global ss
        df = ss.createDataFrame(rdd, schema=['text', 'words', 'length'])
        df.show()
        # df.write.saveAsTable(name='default.tweets', format='hive', mode='append')


sc = SparkContext(appName="Something")
ssc = StreamingContext(sc, 5)
ss = SparkSession.builder.appName("Something").getOrCreate()
ss.sparkContext.setLogLevel('WARN')
ks = KafkaUtils.createDirectStream(ssc, ['kafkaTwitterSpark'],
                                   {'metadata.broker.list': 'localhost:9096'})

lines = ks.map(lambda x: x[1])
transform = lines.map(lambda tweet:
                      (tweet, int(len(tweet.split())), int(len(tweet))))

transform.foreachRDD(handle_rdd)

ssc.start()
ssc.awaitTermination()
Example #34
    stream.map(pair).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).map(
        lambda (k, v): (k, v[0] / v[1])).foreachRDD(send_to_kafka)


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: stream-process.py [topic] [target-topic] [broker-list]")
        exit(1)

    # - create SparkContext and StreamingContext
    sc = SparkContext("local[2]", "StockAveragePrice")
    sc.setLogLevel('INFO')
    ssc = StreamingContext(sc, 5)

    topic, target_topic, brokers = sys.argv[1:]

    # - instantiate a kafka stream for processing
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [topic], {'metadata.broker.list': brokers})
    process_stream(directKafkaStream)

    # - instantiate a simple kafka producer
    kafka_producer = KafkaProducer(bootstrap_servers=brokers)

    # - setup proper shutdown hook
    atexit.register(shutdown_hook, kafka_producer)

    ssc.start()
    ssc.awaitTermination()
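pair, process_stream, send_to_kafka, and shutdown_hook live elsewhere in the original script; a sketch of the producer side, assuming json is imported and that kafka_producer/target_topic are the globals created in __main__:

def send_to_kafka(rdd):
    # publish each (symbol, average_price) record of the batch to the target topic
    for symbol, avg_price in rdd.collect():
        data = json.dumps({'symbol': symbol, 'average': avg_price})
        kafka_producer.send(target_topic, value=data.encode('utf-8'))

def shutdown_hook(producer):
    # flush pending messages and release the connection on exit
    producer.flush(10)
    producer.close(10)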
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.streaming.kafka import KafkaUtils
import json


if __name__ == '__main__':
    sc = SparkContext(appName='PythonSparkStreamingKafka')
    sc.setLogLevel("WARN")  # avoid printing logs
    ssc = StreamingContext(sparkContext=sc, batchDuration=2)
    spark_sql = SQLContext(sparkContext=sc)

    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc,
                                                topics=['trump'],
                                                kafkaParams={"metadata.broker.list": 'localhost:9092'})

    dfs = kafkaStream.\
        map(lambda dstream: dstream[1])
    dfs.pprint(2)

    ssc.start()
    ssc.awaitTermination()
Example #36
import sys

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def echo(rdd):
    print(rdd)


if __name__ == '__main__':
    sc = SparkContext(appName="text")
    ssc = StreamingContext(sc, 5)
    broker = "localhost:2181"
    topic = "model"
    kbrokers = "localhost:9092"
    # kvs=KafkaUtils.createStream(ssc,broker,"3e",{topic:1},kafkaParams={"metadata.broker.list":kbrokers})
    # kvs = KafkaUtils.createStream(ssc, "localhost:2181", 'spark', {"model": 1})
    kvs=KafkaUtils.createDirectStream(ssc,["model"],{"metadata.broker.list": kbrokers})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()

    ssc.awaitTermination()
Example #37
        .option("dbtable", OFFSET_TABLE_NAME) \
        .option("user", TARGET_DB_USER_NAME) \
        .option("password", TARGET_DB_USER_PASSWORD) \
        .load()

    maxOffset = df_read_offsets.agg({'OFFSET': 'max'}).collect()[0][0]
    if maxOffset is None:
        maxOffset = 0

    topicPartion = TopicAndPartition(TOPIC, PARTITION)
    fromOffset = {topicPartion: maxOffset}
    kafkaParams = {
        "metadata.broker.list": BROKER_LIST,
        "enable.auto.commit": "false"
    }

    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [TOPIC],
        kafkaParams,
        fromOffsets=fromOffset,
        keyDecoder=deserializer(),
        valueDecoder=deserializer())

    time.sleep(5)
    directKafkaStream.foreachRDD(lambda x: save_data(x))
    directKafkaStream.transform(store_offset_ranges) \
        .foreachRDD(write_offset_ranges)

    ssc.start()
    ssc.awaitTermination()
Example #38
from big_data_kafka.kafka_producer import produce, get_producer


def push_likes_counts_to_kafka(fb_likes):
    producer = get_producer('likes-counts')
    producer.start()
    for fb_like in fb_likes:
        print('FB like:')
        print(fb_like)
        produce(producer, fb_like)
    # produce_messages(producer, fb_likes)
    producer.stop()


if __name__ == "__main__":
    sc = SparkContext(appName='PythonStreamingDirectKafkaWordCount')
    ssc = StreamingContext(sc, 30)
    # noinspection PyDeprecation
    kvs = KafkaUtils.createDirectStream(ssc, ['bdsample'], {'metadata.broker.list': 'localhost:9092'})
    kvs.pprint()
    print('Running KAFKA JOB')
    likes = kvs.map(lambda message: json.loads(message[1]))
    counts = likes.map(lambda fb: (parser.parse(fb['timestamp']).hour, 1)) \
        .reduceByKey(lambda a, b: a + b)

    counts.pprint()
    counts.foreachRDD(lambda rdd: rdd.foreachPartition(push_likes_counts_to_kafka))

    ssc.start()
    ssc.awaitTermination()
Example #39
    for record in records:
        message = "%s %s %s %s %s %s %s %s" % \
         (record[0][0], record[0][1], record[0][2], record[0][3], record[1][0], record[1][1], record[1][2], record[1][3])
        producer.send_messages('best_flights_2008', message.encode())


# MAIN

sc = SparkContext(appName="BestFlights")
sc.setLogLevel('ERROR')

# Create a local StreamingContext
ssc = StreamingContext(sc, 1)
ssc.checkpoint(
    "s3://hsc4-cc-part2-streaming/checkpoints/checkpoint-best-flights")
lines = KafkaUtils.createDirectStream(ssc, ['input_2008'], \
 {"metadata.broker.list": sys.argv[1], "auto.offset.reset":"smallest"})

# Filter only for data in 2008
lines = lines.map(lambda tup: tup[1])

# Split each line by separator
rows = lines.map(lambda line: line.split())

# Get relevant data
rows = rows.filter(lambda row: len(row) > 8)
airports_fromto = rows.map(lambda row: ( \
  (row[0], row[1], row[2], AMOrPM(row[5])), \
  (row[3], row[4], departureTimePretty(row[5]), float(row[8])) \
 ) \
)
# Filtering just necessary flights
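AMOrPM and departureTimePretty are not included in this excerpt; assuming row[5] holds an HHMM departure-time string from the 2008 airline dataset, they might look like:

def AMOrPM(hhmm):
    hour = int(hhmm.zfill(4)[:2])
    return "AM" if hour < 12 else "PM"

def departureTimePretty(hhmm):
    hhmm = hhmm.zfill(4)
    return "%s:%s" % (hhmm[:2], hhmm[2:])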
Example #40
            for category in categoriesAll[i]:
                #combine two classes
                if labels.collect()[0] > 0:
                    category = category + '_like'
                else:
                    category = category + '_dislike'
                Push_to_ES.push('tweets_2', category, json.dumps(data[i]))


kafkaparams = {
    "zookeeper.connect": "localhost:2181",
    "group.id": "my-group",
    "zookeeper.connection.timeout.ms": "10000",
    "metadata.broker.list": "localhost:9092"
}

Stream_feedback_B = KafkaUtils.createDirectStream(ssc, topic_1, kafkaparams)
Stream_feedback_A = KafkaUtils.createDirectStream(ssc, topic_3, kafkaparams)
Stream_rawtweets = KafkaUtils.createDirectStream(ssc, topic_2, kafkaparams)

Stream_feedback_B.pprint()
Stream_feedback_A.pprint()
Stream_rawtweets.pprint()

Stream_feedback_B.foreachRDD(lambda k: process_feedback_B(k))
Stream_feedback_A.foreachRDD(lambda k: process_feedback_A(k))
Stream_rawtweets.foreachRDD(lambda k: process_tweets(k))

ssc.start()
ssc.awaitTermination()
Example #41
import pyspark_cassandra
from pyspark_cassandra import streaming
import time


def sanitize(line):
    line = line.replace('\\n', ' ')
    line = re.sub(r'\\u.{4}', '', line)
    return line


if __name__ == "__main__":
    sc = SparkContext(appName="TwitterGenerator")
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(
        ssc, ["tweets.topic"], {"metadata.broker.list": "broker:9092"})

    lines = kvs.map(lambda x: x[1])
    counts = lines\
            .flatMap(lambda line: sanitize(line).split(" "))\
            .filter(lambda w: w.startswith('#'))\
            .map(lambda word: (word, 1))\
            .reduceByKey(lambda a, b: a+b)\
            .map(lambda t: (time.time() * 1000, t[0], t[1]))

    counts.pprint()

    counts.saveToCassandra("myks",
                           "test",
                           columns=['timestamp', 'word', 'count'])
    conf.set("spark.cassandra.connection.host", "10.240.14.37")
    conf.set("spark.cassandra.connection.port", "9042")

    # SparkContext represents the connection to a Spark cluster
    # Only one SparkContext may be active per JVM
    sc = SparkContext(conf=conf)

    # Creating a streaming context with batch interval of 10 sec
    # As the main point of entry for streaming, StreamingContext handles the streaming application's actions,
    # including checkpointing and transformations of the RDD.
    ssc = StreamingContext(sc, 3)

    # Returns a DStream (an object made up of RDDs; the RDD is Spark's unit of data)
    kafkaStream = KafkaUtils.createDirectStream(
        ssc,
        topics=["tweets"],
        kafkaParams={"bootstrap.servers": "localhost:9092"}
        #"group.id" -> "spark-streaming-notes",
        #"auto.offset.reset" -> "earliest"
    )

    #Parse Twitter Data as json
    json_stream = kafkaStream.map(lambda tweet: json.loads(tweet[1]))
    parsed = json_stream.map(lambda tweet: tweet_filter(tweet))
    parsed.foreachRDD(lambda x: x.saveToCassandra("bts", "tweet_dataset"))
    #parsed.pprint()

    #Start Execution of Streams
    ssc.start()
    ssc.awaitTermination()
Example #43
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def deal_data(rdd):
    data = rdd.collect()
    for d in data:
        print(d)


sc = SparkContext(sparkHome="local", appName="Realtime-Analytics-Engine")
ssc = StreamingContext(sc, batchDuration=int(6))

kafkaParams = {
    "metadata.broker.list":
    "192.168.32.18:9092,192.168.32.19:9092,192.168.32.20:9092",
    "serializer.class": "kafka.serializer.StringEncoder",
    "auto.offset.reset": "smallest",
    "fetch.message.max.bytes": "22388608"
}
kvs = KafkaUtils.createDirectStream(
    ssc,
    ['senmdt-cache-records'],
    kafkaParams,
    keyDecoder="kafka.serializer.StringEncoder",
    valueDecoder="kafka.serializer.StringEncoder")

kvs.foreachRDD(lambda rdd: deal_data(rdd))
    .option('kudu.master',kuduMaster)\
    .option('kudu.table','impala::sensors.asset_sensors').load()

assets = sqc.read.format('org.apache.kudu.spark.kudu')\
    .option('kudu.master',kuduMaster)\
    .option('kudu.table','impala::sensors.well_assets').load()

sensorInfo = sensors.join(assets, ['asset_id'])

# Persist in memory for fast lookup
sensorInfo.persist(StorageLevel.MEMORY_ONLY)
sensorInfo.show()

# Initialize the Spark Streaming Context to pull data from Kafka every 5 seconds
ssc = StreamingContext(sc, 30)
kafkaStream = KafkaUtils.createDirectStream(
    ssc, [kafkaTopic], {"metadata.broker.list": kafkaBroker})
sensorDS = kafkaStream.map(lambda x: x[1])


def process(time, rdd):
    print("========= Time: %s =========" % str(time))
    try:
        rawSensor = spark.read.json(rdd)
        rawSensor = rawSensor.withColumn('sensor_id',rawSensor.sensor_id.cast('integer'))\
          .withColumn('value',rawSensor.value.cast('float'))

        if rawSensor.count() == 0:
            print('No data, sleep until next window')
            return

        print('Raw Sensor Data:')
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.context import SQLContext


if __name__ == "__main__":
    sc = SparkContext("local", "wordcounttest")
    #sc.setLogLevel(logLevel="OFF")
    ssc = StreamingContext(sc, 20)
    #ssc.checkpoint("c:\Playground\spark\logs")
    brokers, topic = sys.argv[1:]
 
    kvs = KafkaUtils.createDirectStream(ssc, ["events.noflguid"],{"metadata.broker.list":"ec2-52-203-200-3.compute-1.amazonaws.com:9092"})

    print(str(brokers))
    print(str(topic))
  
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a+b)
    counts.pprint()
    kvs.count().pprint()  # count() on a DStream returns another DStream; pprint shows per-batch counts
    ssc.start()
    ssc.awaitTermination()
Example #46
        with open(sys.argv[1], 'r') as yml:
            config = yaml.load(yml)
    except FileNotFoundError as ex:
        print('ERROR: Config file does not exist: ' + str(ex) + '\n')
        traceback.print_tb(ex.__traceback__)

    print(config)

    spark = SparkSession \
        .builder \
        .appName(config['app_name']) \
        .getOrCreate()
    ssc = StreamingContext(spark.sparkContext, config['time_batch_window'])
    sql_context = SQLContext(spark)

    kvs = KafkaUtils.createDirectStream(ssc,
                                        [config['kafka']['topic']],
                                        {"metadata.broker.list": config['kafka']['brokers']})

    data_functions = DataFunction(config)

    lines = kvs \
        .map(lambda rdd: rdd[1]) \
        .map(data_functions.load_json_from_string) \
        .filter(lambda x: x != {})

    lines.foreachRDD(lambda rdd: data_functions.create_df(spark, sql_context, rdd))

    ssc.start()
    ssc.awaitTermination()
Example #47
#Insert data into Kudu
def insert_into_kudu(time, rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = sqc.createDataFrame(rdd, schema)
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu') \
                 .option('kudu.master',kudu_master) \
                 .option('kudu.table',kudu_table) \
                 .mode("append") \
                 .save()


if __name__ == "__main__":
    sc = SparkContext(appName="SparkStreaming_IoT")
    ssc = StreamingContext(sc, 5)  # 5 second window
    kvs = KafkaUtils.createDirectStream(
        ssc, [kafka_topic], {"metadata.broker.list": kafka_brokers})

    # parse the kafka message into a tuple
    kafka_stream = kvs.map(lambda x: x[1]) \
                           .map(lambda l: json.loads(l)) \
                           .map(lambda p: (int(p['sensor_id']),
                                           int(p['sensor_ts']),
                                           float(p['sensor_0']),
                                           float(p['sensor_1']),
                                           float(p['sensor_2']),
                                           float(p['sensor_3']),
                                           float(p['sensor_4']),
                                           float(p['sensor_5']),
                                           float(p['sensor_6']),
                                           float(p['sensor_7']),
                                           float(p['sensor_8']),
# return to the pool for future reuse
# ConnectionPool.returnConnection(connection)

# To Run:
# sudo $SPARK_HOME/bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 kafka-spark-test.py
if __name__ == "__main__":

    # To run on cluster:
    # conf = SparkConf().setAppName("Venmo-Graph-Analytics-Dev").setMaster("spark://ip-172-31-0-135:7077")
    # sc = SparkContext(conf=conf)

    # To run locally:
    sc = SparkContext(appName="Venmo-Graph-Analytics-Dev")

    # Set up resources
    ssc = StreamingContext(sc, 1)  # Set Spark Streaming context

    # brokers = "ec2-50-112-19-115.us-west-2.compute.amazonaws.com:9092,ec2-52-33-162-7.us-west-2.compute.amazonaws.com:9092,ec2-52-89-43-209.us-west-2.compute.amazonaws.com:9092"
    brokers = "ec2-52-25-139-222.us-west-2.compute.amazonaws.com:9092"
    topic = 'Venmo-Transactions-Dev'

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, [topic], {"metadata.broker.list": brokers})

    transaction = kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1]))\
        .map(lambda json_body: extract_data(json_body))\
        .foreachRDD(lambda rdd: rdd.foreachPartition(send_partition))
    # transaction.pprint()

    ssc.start()
    ssc.awaitTermination()
    return rdd


def print_offset(rdd):
    for o in offsets:
        print("%s %s %s %s %s" % (o.topic, o.partition, o.fromOffset,
                                  o.untilOffset, o.untilOffset - o.fromOffset))


config = SparkConf().set("spark.streaming.kafka.maxRatePerPartition", 30000)
scontext = SparkContext(conf=config)
#scontext = SparkContext("local[2]", "kafka_pyspark_test")
stream_context = StreamingContext(scontext, 3)
msg_stream = KafkaUtils.createDirectStream(
    stream_context, [
        'test',
    ],
    kafkaParams={"metadata.broker.list": "127.0.0.1:9092,"})
'''result = msg_stream.map(lambda x :json.loads(x).keys()).reduce(out_put)
msg_stream.transform(store_offset,).foreachRDD(print_offset)
result.pprint()
'''

targets = msg_stream.map(lambda msg_stream: msg_stream[1])

json_values = []


def write(res):
    json_str = json.dumps(res)
    with open("test_data.json", "a") as json_file:
Example #50
import csv
from json import loads
from flatten_json import flatten
from time import sleep
# import pandas as pd


print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")

sc= SparkContext()
ssc = StreamingContext(sc, 10)
sqlc= SQLContext(sc)
directKafkaStream = KafkaUtils.createDirectStream(ssc, ["kafkaNBA"], {"metadata.broker.list": "localhost:9099"})
lines= directKafkaStream.map(lambda x: x[1])

print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")

def transformer(rdd):
	my_obj= json.loads(rdd)
	return (my_obj["player"]["weight_pounds"])
transform= lines.map(transformer)


def build_df(rdd):
	if not rdd.isEmpty():
# -*- coding: UTF-8 -*-

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from setting.default import DefaultConfig
import happybase

# 1. Create the Spark Streaming context configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_ONLINE_CONFIG)
sc = SparkContext(conf=conf)
stream_sc = StreamingContext(sc, 60)

# 2. Configuration for reading from Kafka
# If a click-log stream has multiple consumers, set a group id so each consumer group gets its own copy of the data
similar_kafka = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER, "group.id": 'similar'}

# 2.1 Consumer (user click log)
SIMILAR_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], similar_kafka)

# 2.2 Configuration for reading HOT articles
kafka_params = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER}
HOT_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], kafka_params)

# 2.3 Kafka configuration for reading new articles
click_kafkaParams = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER}
NEW_ARTICLE_DS = KafkaUtils.createDirectStream(stream_sc, ['new-article'], click_kafkaParams)
Example #52
def enc(data):
    result = {k: helper(v) for k, v in data.items()}
    return result


#return dict(map(lambda line: line.encode('ascii'), pair.value) for pair in data.items())

conf = SparkConf().setMaster("local[*]").setAppName("StreamingDirectKafka")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)
skQuorum = "localhost:2181"
topic = ["meetup"]
kafkaParams = {"metadata.broker.list": "localhost:9092"}

#, kafkaParams = {"metadata.broker.list":"localhost:9092"}
kafkaStream = KafkaUtils.createDirectStream(ssc, topic, kafkaParams)
#stream = ssc.receiverStream( \
#    MeetupReceiver("https://stream.meetup.com/2/rsvps") \
#)
"""
data = kafkaStream.map(lambda line: json.loads(line)
"""
rsvp = kafkaStream.map(lambda line: line[1])
rsvp2 = rsvp.map(lambda line: json.loads(line.encode("ascii", "ignore")))

#kafkaStream.pprint()
"""
process = data.mapValues(lambda line: line.encode('ascii')).cache()
"""

#event = data["topic_name"]
Example #53
    new_vals0 = 0.0
    new_vals1 = 0
    for val in new_values:
        new_vals0 += val[0]
        new_vals1 += val[1]
    last_vals0 = last_sum[0] if last_sum is not None else 0.0
    last_vals1 = last_sum[1] if last_sum is not None else 0

    return (new_vals0 + last_vals0,\
            new_vals1 + last_vals1)



kafkaStream = KafkaUtils.createDirectStream(ssc,[topic], {
    'bootstrap.servers':'localhost:9092',
    'group.id':'video-group',
    'fetch.message.max.bytes':'15728640',
    'auto.offset.reset':'largest'}) # Group ID is completely arbitrary

ontime_data = kafkaStream.map(lambda x: x[1]).map(split).flatMap(parse)

filtered = ontime_data.map(lambda fl: ((fl.Origin, fl.Dest), (fl.ArrDelay, 1)))\
                .updateStateByKey(updateFunction)


# filtered.foreachRDD(lambda rdd: print_rdd(rdd))
filtered.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))

ssc.start()
# time.sleep(600)  # run the stream for 10 minutes in case no producer is detected
# ssc.awaitTermination()
# ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example #54
    .master("local[*]")\
    .getOrCreate()

sc = spark.sparkContext
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# the topic to subscribe
topic_to_sub = ["test"]
# the address of kafka, separate with comma if there are many
bootstrap_servers = "localhost:9092"
# kafka config info
kafka_params = {"metadata.broker.list": bootstrap_servers}

# initialize stream to consume data from kafka
kafka_stream = KafkaUtils.createDirectStream(ssc=ssc,
                                             topics=topic_to_sub,
                                             kafkaParams=kafka_params)

kafka_stream.pprint()
r = redis.Redis("127.0.0.1")


def save_redis(rdd):
    """
    save word count result into redis hash.

    example:
        word_count
            green 9
            blue 3
            red 1
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext
from pyspark import SparkContext
from pyspark.sql import SparkSession
import json
from json import loads

# spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.4 try1.py

sc = SparkContext(appName="samir")
ssc = StreamingContext(sc, 5)

ks = KafkaUtils.createDirectStream(ssc, ['kafkaNBA2'],
                                   {'metadata.broker.list': 'localhost:9092'})

result1 = ks.map(lambda x: json.loads(x[1])).flatMap(lambda x: x['data']).map(
    lambda x: x['player'])

result1.pprint()


def handle_rdd(rdd):
    if not rdd.isEmpty():
        global ss
        df = ss.createDataFrame(rdd,
                                schema=[
                                    'first_name', 'last_name', 'height_inches',
                                    'weight_pounds', 'team_id', 'height_feet',
                                    'position', 'id'
                                ])
        df.show()
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == '__main__':

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    kstream = KafkaUtils.createDirectStream(ssc, topics = ['CodeSubmission'], \
      kafkaParams = {"metadata.broker.list": '52.53.157.26:9092'})

    data = kstream.map(lambda x: x[1].encode("utf-8"))
    data.pprint()

    ssc.start()
    ssc.awaitTerminationOrTimeout(30)
    ssc.stop(stopGraceFully=True)
def getSqlContext(conf):
    input_uri = "mongodb://localhost:27017/Bitcoin.bitcoin"
    output_uri = "mongodb://localhost:27017/Bitcoin.bitcoin"

    conf.set('spark.mongodb.input.uri', input_uri)
    conf.set('spark.mongodb.output.uri', output_uri)
    conf.set('spark.mongodb.input.sampleSize', 50000)

    sc = SparkContext.getOrCreate(conf=conf)
    return SQLContext(sc)


conf = SparkConf().setAppName("BitcoinPrediction").setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)
ssc = StreamingContext(sc, 5)
data = KafkaUtils.createDirectStream(
    ssc,
    topics=["Bitcoin"],
    kafkaParams={"metadata.broker.list": "localhost:9092"})

sqlContext = getSqlContext(conf)

# foreachRDD returns None, so the per-batch DataFrame has to be built inside the callback
def write_to_mongo(rdd):
    if not rdd.isEmpty():
        # column names here are placeholders for the Kafka message key and value
        df = sqlContext.createDataFrame(rdd, ["key", "value"])
        df.write.format("mongo").mode("append").save()

data.foreachRDD(write_to_mongo)

ssc.start()
ssc.awaitTermination()
Example #58
sc = SparkContext(appName="mytstApp")
sc.setLogLevel("ERROR")  # reduce log output in the shell
ssc = StreamingContext(sc, 30)
tlist = ['Spark_1','Spark_2']
checkpoint_dir = './Checkpoint/spark'
ssc.checkpoint(checkpoint_dir)

kafka_params = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "myUserGroup",
    "enable.auto.commit": "false",
    "auto.offset.reset": "largest"
}
dstream = [KafkaUtils.createDirectStream(ssc, [tlist[i]], kafka_params,\
      keyDecoder=spot_decoder,\
      valueDecoder=spot_decoder,\
            messageHandler=setHandler )\
           for i in range(len(tlist))
           ]
countList = []

for index in range(len(tlist)):
    print(tlist[index])
    tempt = ( dstream[index].map( lambda x : getID(x) )\
                    .map( lambda x : ( 1,  x))\
                    .updateStateByKey( updatefunction )\
                )
    countList.append(tempt)
    countList[index].foreachRDD(lambda x: displayID(x))
if __name__ == "__main__":

    # To run on cluster:
    # conf = SparkConf().setAppName("Venmo-Graph-Analytics-Test").setMaster("spark://ip-172-31-0-135:7077")
    # sc = SparkContext(conf=conf)

    # To run locally:
    sc = SparkContext(appName="Venmo-Graph-Analytics-Test")

    # Set up resources
    ssc = StreamingContext(sc, 1)  # Set Spark Streaming context

    # brokers = "ec2-50-112-19-115.us-west-2.compute.amazonaws.com:9092,ec2-52-33-162-7.us-west-2.compute.amazonaws.com:9092,ec2-52-89-43-209.us-west-2.compute.amazonaws.com:9092"
    brokers = "ec2-52-25-139-222.us-west-2.compute.amazonaws.com:9092"

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, ['Venmo-Transactions-Test'], {"metadata.broker.list": brokers})

    transaction = kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1]))\
        .map(lambda json_body: extract_data(json_body))\
        .foreachRDD(lambda rdd: rdd.foreachPartition(send_partition))

    # transaction.pprint()

    # lines = kafka_stream.map(lambda x: x[1])
    # counts = lines.flatMap(lambda line: line.split(" ")) \
    #     .map(lambda word: (word, 1)) \
    #     .reduceByKey(lambda a, b: a+b)
    # counts.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #60
			#tf-idf calculation
			tf = HashingTF(numFeatures=numFeatures).transform(prep_filtered.map(porter_stem, preservesPartitioning=True))
			idf = IDF().fit(tf)
			train_tfidf = idf.transform(tf)

			#set training dataset with label
			training = review_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
			
			#train the model classifier
			model = NaiveBayes.train(training)
			model_name = "naivebayes"+str(counter_model)
			#save model classifier to HDFS
			output_dir = "hdfs://VM10-1-0-14:9000/classifier/"+model_name
			model.save(sc, output_dir)
			
			counter_model.add(1)

			end = time.time()
			print("Model Name : ", model_name ,", Total Reviews : ", reviews.count(), "Processing Time : ", (end-start))
	
	#create stream initiation to Kafka
	kvs = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
	parsed = kvs.map(lambda v: json.loads(v[1]))
	reviews = parsed.map(lambda r: [r['overall'], r['reviewText']])	
	
	reviews.foreachRDD(process)	

	ssc.start()
	ssc.awaitTermination()