def main(ssc):
    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #2
def stream(ssc):

    zkQuorum = "localhost:2181"
    topic = "topic1"
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['topic1'], kafkaParams = {"metadata.broker.list":"localhost:9092"})

    tweets = tweets.map(lambda x: x[1].encode("ascii","ignore"))
    return tweets
Example #3
def main():
    # Create a local StreamingContext with two working thread and batch interval of 5 second
    sc = SparkContext("spark://ip-172-31-29-29:7077", "MyKafkaStream")

    # stream interval of 5 seconds
    ssc = StreamingContext(sc, 5)
    kafkaStream = KafkaUtils.createStream(ssc, "52.3.61.194:2181", "GroupNameDoesntMatter", {"parking_sensor_data": 2})
    messages = kafkaStream.flatMap(lambda s: create_tuple(s[1])).reduceByKey(lambda a,b: (int(a)+int(b))/2)
    messages1 = messages.filter(lambda s: s[1] > 0)
    messages1.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def bro_parse(zk,topic,db,db_table,num_of_workers):
    
    app_name = "ONI-INGEST-{0}".format(topic)
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc,1)
    sqc = HiveContext(sc)

    # create DStream for each topic partition.
    topic_dstreams = [ KafkaUtils.createStream(ssc, zk, app_name, {topic: 1}, keyDecoder=oni_decoder, valueDecoder=oni_decoder) for _ in range (wrks)  ] 
    tp_stream = ssc.union(*topic_dstreams)

    # Parallelism in Data Processing
    #processingDStream = tp_stream(wrks)

    # parse the RDD content.
    proxy_logs = tp_stream.map(lambda x: proxy_parser(x[1]))

    # save RDD into hive .
    proxy_logs.foreachRDD(lambda x: save_to_hive(x,sqc,db,db_table,topic))

    ssc.start()
    ssc.awaitTermination()
Example #5
def main():
    if len(sys.argv) not in (3, 4):
        print("Usage: kafka_wordcount.py <zk> <topic> [timeout]",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)
    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a+b)
    counts.pprint()
    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout
    ssc.start()
    ssc.awaitTermination(**kwargs)
def main():
    conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    try:
        kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2})
        kafka_streams.foreachRDD(process_rdd)
    except Exception as e:
        print e
    ssc.start()
    ssc.awaitTermination()
Example #7
    def test_kafka_stream(self):
        """Test the Python Kafka stream API."""
        topic = self._randomTopic()
        sendData = {"a": 3, "b": 5, "c": 10}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                         "test-streaming-consumer", {topic: 1},
                                         {"auto.offset.reset": "smallest"})
        self._validateStreamResult(sendData, stream)
def main(ssc):
    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 2})
    lines = kvs.map(lambda x: x[1])

    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SQLContext
            sqlContext = getSqlContextInstance(rdd.context)
            # Convert RDD[String] to RDD[Row] to DataFrame
            parts = rdd.map(lambda line: line.split(","))
            delays_rdd= parts.map(lambda w: Row(carrier=w[0], origin=w[1], delay=float(w[2])))
            delays = sqlContext.createDataFrame(delays_rdd, samplingRatio=1)

            avg_delays = delays.groupBy("origin", "carrier").agg(F.avg(delays.delay).alias('average'))

            avg_delays.write.format("org.apache.spark.sql.cassandra").\
                options(table="task2_part2_group2_1", keyspace="mykeyspace").\
                save(mode="append")

            # Register as table
            #dataFrame.registerTempTable("origin_carrier_delays")
            # Do word count on table using SQL and print it
            #carrier_delays_df = \
            #    sqlContext.sql("SELECT origin, carrier, avg(delay) AS average FROM origin_carrier_delays GROUP BY origin, carrier")
            #carrier_delays_df.registerTempTable("origin_carrier_avg_delays")
            #carrier_avg_delays_df = \
            #    sqlContext.sql("SELECT origin, carrier, avg_delay FROM origin_carrier_avg_delays GROUP BY origin ORDER BY avg_delay LIMIT 10")
            #for i in carrier_delays_df.rdd.takeOrderedByKey(10, sortValue=lambda x: x[2], reverse=False).map(lambda x: x[1]).collect():
            #    print (i)
            #dataFrame.select("origin", "carrier", "delay").write \
            #carrier_delays_df.write \
            #    .format("org.apache.spark.sql.cassandra") \
            #    .options( table = "task2_part2_group2_1", keyspace = "mykeyspace") \
            #    .save(mode="append")
            #carrier_delays_df.show()
        except Exception as e: print (e)
        #except:
        #    pass
    #data = lines.map(lambda line: line.split(",")) \
    #    .map(lambda word: (word[0], float(word[1])) ) \
    #    .aggregateByKey((0,0), lambda a,b: (a[0] + b, a[1] + 1), lambda a,b: (a[0] + b[0], a[1] + b[1])) \
    #    .mapValues(lambda v: v[0]/v[1]) \
    #    .updateStateByKey(updateFunc) \
    #    .transform(lambda rdd: rdd.sortBy(lambda (word, count): -count))
    #data.pprint()
    lines.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
def main(ssc):
    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    data = lines.map(lambda line: line.split(",")) \
        .flatMap(lambda word: [(word[0], 1), (word[1], 1)]) \
        .reduceByKey(lambda a, b: a+b) \
        .updateStateByKey(updateFunc) \
        .transform(lambda rdd: rdd.sortBy(lambda (word, count): -count))
    data.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #10
def main():
    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    zkQuorum = "localhost:2181"
    topic = "twitter_raw"
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: pickle.loads(x[1].decode("utf-8"))["text"])  # fetch the text
    count = lines.map(lambda line: len(line.split())).reduce(add)  # split into words and count
    count.foreachRDD(publishToRedis)  # publish to redis
    count.pprint()

    ssc.start()
    ssc.awaitTermination()
def ss_kafka_bucket_counter(broker, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time

	Args:
		broker: the kafka broker that we look at for the topic
		topic: the kafka topic for input
		bucket_interval: the time interval in seconds (int) that the job will 
			bucket
		output_msg: a function that takes in a spark SparkContext (sc) and 
			StreamingContext (ssc) and returns a function that takes a rdd that 
			performs the output task
		message_parse: how the message is to be parsed
		valueDecoder: same as Spark's valueDecoder

	Returns:
		None
		
	"""
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createStream(ssc, broker, "spark-streaming-consumer", {topic: 1}, valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createStream(ssc, broker, "spark-streaming-consumer", {topic: 1})

        # I assume that we do not store kafka keys
    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)

    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
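As a hedged usage sketch for the docstring above: output_msg is a factory that receives sc and ssc and returns the per-RDD callback, while message_parse maps a raw line to its bucket key. The helper names below (print_counts, parse_minute) and the endpoint/topic are illustrative, not from the original.

# Illustrative helpers matching the contract described in the docstring above.
def print_counts(sc, ssc):
    def output(rdd):
        for key, count in rdd.collect():
            print("%s\t%d" % (key, count))
    return output

def parse_minute(line):
    # assumes each message starts with an epoch timestamp; bucket by minute
    return int(line.split(",", 1)[0]) // 60

# ss_kafka_bucket_counter("zk-host:2181", "my-topic", 60, print_counts, parse_minute)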
Example #12
	def consume(self):
		messages = KafkaUtils.createStream(self.ssc, self.zookeeper, "spark-streaming-consumer", {self.topic: 1})
		lines = messages.map(lambda x: x[1])

		rows = lines.map(lambda x: { 
			"data": json.loads(x)['data'],
			"time": json.loads(x)['time']
		})

		rows.foreachRDD(lambda x: {
			self.check_and_write(x)
		})

		self.ssc.start()
		self.ssc.awaitTermination()
Example #13
def main():
    #main function to execute code
    sqlContext = SQLContext(sc)
    zk_host = zk_ip+":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions={topic:1}
    #create kafka stream
    kvs = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions,valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    readings = lines.map(lambda x: Row(device_id=x["device_id"],\
        metric_time=datetime.datetime.fromtimestamp(int(x["metric_time"])),\
        metric_name=x["metric_name"],\
        metric_value=float(x["metric_value"])))
    readings.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
Example #14
def bluecoat_parse(zk,topic,db,db_table,num_of_workers,batch_size):
    
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc,int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks}, keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]).flatMap(lambda row: row.split("\n")).filter(lambda row: rex_date.match(row)).map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " ")).map(lambda row:  split_log_entry(row)).map(lambda row: proxy_parser(row))
    saved_data = proxy_data.foreachRDD(lambda row: save_data(row,sqc,db,db_table,topic))
    ssc.start()
    ssc.awaitTermination()
    def readSource(ssc, di_in_conf_with_ds_conf, app_conf):
        sourceType = di_in_conf_with_ds_conf['source.type']

        if sourceType == 'kafka':
            kafkaSimpleConsumerApiUsed = app_conf.get('kafka.simple.consumer.api.used', True)
            if kafkaSimpleConsumerApiUsed:
                topics = di_in_conf_with_ds_conf['topics']
                if not isinstance(topics, list):
                    raise TypeError("topic should be list")

                brokers = di_in_conf_with_ds_conf['metadata.broker.list']
                kafkaParams = {"metadata.broker.list": brokers}
                stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams).map(lambda x: x[1])
            else:
                zkConnect = di_in_conf_with_ds_conf['zookeeper.connect']
                groupId = app_conf['group.id']
                numReceivers = app_conf.get('num.receivers', 1)
                numConsumerFetchers = app_conf.get('num.consumer.fetchers')
                topics = di_in_conf_with_ds_conf['topics']
                topic_map = dict(zip(topics, numConsumerFetchers))
                # streams = reduce(lambda x, y: x.union(y),
                #                  map(KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map),
                #                      range(0, numReceivers)))
                streams = [KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map) for i in range(0, numReceivers)]
                stream = ssc.union(streams).map(lambda x: x[1])
        elif sourceType == 'hdfs':
            path = di_in_conf_with_ds_conf['fs.defaultFS'] + '/' + di_in_conf_with_ds_conf['path']
            stream = ssc.textFileStream(path)
        else:
            raise Exception('Error: unsupported source.type = ' + sourceType)

        num_repartition = app_conf.get('dataInterface.stream.repatition.partitions')
        if num_repartition is None or not isinstance(num_repartition, int):
            stream2 = stream
        else:
            stream2 = stream.repartition(num_repartition)

        # whether to apply the formatting plugin class
        format_class_path = di_in_conf_with_ds_conf.get('format.class', '')
        if format_class_path.strip() == '':
            stream3 = stream2
        else:
            format_class_obj = get_class_obj(format_class_path)
            stream3 = format_class_obj.format(stream2)

        return stream3
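A hedged sketch of the two configuration dictionaries readSource above expects; the keys mirror the ones the function reads, while the hosts, topic names, and group id are placeholders.

# Hypothetical configuration for readSource (placeholder hosts and topic names).
di_in_conf_with_ds_conf = {
    'source.type': 'kafka',
    'topics': ['events'],
    'metadata.broker.list': 'broker1:9092,broker2:9092',
    'zookeeper.connect': 'zk1:2181',
    'format.class': '',  # empty: skip the formatting plugin step
}
app_conf = {
    'kafka.simple.consumer.api.used': True,  # True selects createDirectStream
    'group.id': 'di-consumer-group',
    'num.receivers': 1,
    'dataInterface.stream.repatition.partitions': 4,  # key spelled as the code reads it
}
# stream = readSource(ssc, di_in_conf_with_ds_conf, app_conf)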
Example #16
File: tests.py Project: 4bin/spark
    def test_kafka_stream(self):
        """Test the Python Kafka stream API."""
        topic = "topic1"
        sendData = {"a": 3, "b": 5, "c": 10}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                         "test-streaming-consumer", {topic: 1},
                                         {"auto.offset.reset": "smallest"})

        result = {}
        for i in chain.from_iterable(self._collect(stream.map(lambda x: x[1]),
                                                   sum(sendData.values()))):
            result[i] = result.get(i, 0) + 1

        self.assertEqual(sendData, result)
Example #17
def main():
#main function to execute code
    sc = SparkContext(appName="ReadingWriter")
    ssc = StreamingContext(sc,10)
    sqlContext = SQLContext(sc)
    zk_host = zk_ip+":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions={"amtest":1}
    #create kafka stream
    kvs = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions,valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    #readings = lines.map(lambda x: {"device_id":x["device_id"],"metric_time":x["metric_time"],"metric_name":x["metric_name"],"metric_value":x["metric_value"]})
    readings = lines.map(lambda x: {"device_id":x["device_id"],"metric_time":datetime.datetime.fromtimestamp(int(x["metric_time"])),"metric_name":x["metric_name"],"metric_value":float(x["metric_value"])})
    readings.foreachRDD(lambda rdd: rdd.saveToCassandra("metrics", "raw_metrics"))
    #readingdf.show()
    #readings.pprint()
    #lines.saveToCassandra("metrics", "raw_metrics")
    ssc.start()
    ssc.awaitTermination()
Example #18
    def main(self):

        # loading configuration parameters (from a config file when working on a project)
        zk, topic, app_name, batch_duration, master = self.setConfiguration()

        # initiate the spark context / streaming context
        conf = (SparkConf().setMaster(master))
        sc = SparkContext(appName=app_name, conf=conf)
        ssc = StreamingContext(sc, batch_duration)

        # reading data to kafka
        kvs = KafkaUtils.createStream(ssc, zk, "spark-streaming-consumer",
                                      {topic: 1})
        lines = kvs.map(lambda x: x[1])

        lines.pprint()

        ssc.start()  # Start the computation
        ssc.awaitTermination()  # Wait for the computation to terminate
        sc.stop()
Example #19
def start():
    # sc = SparkContext(appName='txt', conf=sconf)
    sc = SparkContext("spark://192.168.1.148:7077", "NetworkWordCount")
    ssc = StreamingContext(sc, 3)
    brokers = "192.168.1.148:2181"
    topic = 'taimei'

    user_data = KafkaUtils.createStream(ssc, brokers,
                                        "spark-streaming-consumer", {topic: 1})
    # fromOffsets: start consuming from the given starting offsets
    # user_data = KafkaUtils.createDirectStream(ssc,[topic],kafkaParams={"metadata.broker.list":brokers},fromOffsets={TopicAndPartition(topic,partition):long(start)})
    user_fields = user_data.map(lambda line: line[1].split('|'))
    gender_users = user_fields.map(lambda fields: fields[3]).map(
        lambda gender: (gender, 1)).reduceByKey(lambda a, b: a + b)
    # user_data.foreachRDD(offset)  # persist offset information
    print("---------")
    gender_users.pprint()
    gender_users.foreachRDD(lambda rdd: rdd.foreach(echo))  # emits tuples
    ssc.start()
    ssc.awaitTermination()
Example #20
def main():
    #main function to execute code
    sqlContext = SQLContext(sc)
    zk_host = zk_ip + ":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions = {topic: 1}
    #create kafka stream
    kvs = KafkaUtils.createStream(ssc,
                                  zk_host,
                                  consumer_group,
                                  kafka_partitions,
                                  valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    readings = lines.map(lambda x: Row(device_id=x["device_id"],\
        metric_time=datetime.datetime.fromtimestamp(int(x["metric_time"])),\
        metric_name=x["metric_name"],\
        metric_value=float(x["metric_value"])))
    readings.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
    def __init__(self, config):

        self._server = config.content["input"]["options"]["server"]
        self._port = config.content["input"]["options"]["port"]
        self._topic = config.content["input"]["options"]["topic"]
        self._consumer_group = config.content["input"]["options"][
            "consumer_group"]
        self._batchDuration = config.content["input"]["options"][
            "batchDuration"]
        self._sep = config.content["input"]["options"]["sep"]

        self._spark = SparkSession.builder.appName(
            "StreamingDataKafka").getOrCreate()
        sc = self._spark.sparkContext

        sc.addFile(config.content["databases"]["country"])
        sc.addFile(config.content["databases"]["city"])
        sc.addFile(config.content["databases"]["asn"])

        self._ssc = StreamingContext(sc, self._batchDuration)

        list_conversion_function = list(
            (map(lambda x: type_to_func(x.dataType),
                 config.data_structure_pyspark)))
        ranked_pointer = list(enumerate(list_conversion_function))
        functions_list = list(
            map(lambda x: lambda list_string: x[1](list_string[x[0]]),
                ranked_pointer))
        function_convert = lambda x: list(
            map(lambda func: func(x), functions_list))
        try:
            dstream = KafkaUtils.createStream(
                self._ssc, "{0}:{1}".format(self._server, self._port),
                self._consumer_group, {self._topic: 1})
            self._dstream = dstream.map(
                lambda x: function_convert(x[1].split(",")))
        except:
            raise KafkaConnectError(
                "Kafka error: Connection refused: server={} port={} consumer_group={} topic={}"
                .format(self._server, self._port, self._consumer_group,
                        self._topic))
Example #22
def main():
#main function to execute code
    sc = SparkContext(appName="CouponCounterPySpark")
    ssc = StreamingContext(sc,10)
    zk_host = "localhost:2181"
    consumer_group = "coupon-event-consumers"
    kafka_partitions={"test":1}
    #create kafka stream
    lines = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions)
    events = lines.map(lambda line: line[1].split(','))
    tmpagg = events.map(lambda event: ((event[1]),1) )
    coupon_counts = tmpagg.reduceByKey(lambda x,y: x+y)
    coupon_records = coupon_counts.map(lambda x: {"offer_id" : x[0], "bucket" : str(datetime.datetime.now().strftime("%s")), "count" : int(x[1])})
    #coupon_records.pprint()
    #coupon_records.registerTempTable("coupon_counters")
    #coupon_records.select("offer_id","bucket","count").show()
    #coupon_records = coupon_counts.map(lambda record: {"offer_id" : record[0],"bucket" : str(int(datetime.datetime.now().strftime("%s"))*1000),"count" : int(record[1])}
    coupon_records.pprint()
    coupon_records.foreachRDD(lambda rdd: rdd.saveToCassandra("loyalty","coupon_counters"))
    ssc.start()
    ssc.awaitTermination()
def createContext(zkQuorum, topic):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    print("Creating new context")
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {topic: 1})
    lines = kvs.map(lambda x: json.loads(x[1]))
    #lines.pprint()
    # hard one
    #pairs=lines.map(lambda x: (x['uid'],x['topic']))
    # simple one
    pairs = lines.map(lambda x: (x['uid'], 1))
    #pairs.pprint()

    windowedWordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y,
                                                    lambda x, y: x - y, 30, 10)
    windowedWordCounts.pprint()
    return ssc
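The comment at the top of createContext suggests the usual recovery pattern: build the context through StreamingContext.getOrCreate so it is only constructed when no checkpoint exists. A minimal sketch, assuming the checkpoint path is a placeholder and that sys and StreamingContext are imported in the original file:

# Hedged driver sketch; the checkpoint directory is a placeholder.
if __name__ == "__main__":
    zkQuorum, topic = sys.argv[1:]
    checkpointDir = "/tmp/recoverable_kafka_wordcount"
    ssc = StreamingContext.getOrCreate(checkpointDir,
                                       lambda: createContext(zkQuorum, topic))
    ssc.checkpoint(checkpointDir)  # windowed reduce with an inverse function needs checkpointing
    ssc.start()
    ssc.awaitTermination()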
Example #24
def main():
    global topic
    topic = "topic_name"
    global errortopic
    errortopic = 'error_topic_data'

    sc = CreateSparkContext()
    ssc = StreamingContext(sc, 10)
    try:
        kafka_stream = KafkaUtils.createStream(ssc, "192.168.0.1:2181", "spark-streaming-consumer", {topic:12})

        raw = kafka_stream.flatMap(lambda kafkaS: [kafkaS])

        lines = raw.flatMap(lambda xs: xs[1].split(","))

        counts = lines.map(lambda word: (str(datetime.now()), "api", word))

        counts.foreachRDD(lambda k: saveToCassandra(k, sc, ssc))

    except Exception, e:
        print('error :'+str(e))
def main():
    conf = SparkConf().setAppName("pyspark read")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)

    kafkaStream = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2})
    stream = kafkaStream.map(lambda xs:xs)
    stream.foreachRDD(lambda rdd: rdd.foreach(printRdd))

    # stream.foreachRDD(lambda rdd: rdd.saveToMongodb(mongodb_uri))

    # stream.pprint()

    # filter out flights not departing from United States
    # "depAirportCntry": "United States"
    # messages = kafkaStream.map(lambda xs: json.load(xs))
    # jsonmessages = messages.map(lambda x: json.loads(x))
    # usdepartures = jsonmessages.map(lambda x: x['depAirportCntry'].filter(lambda x: "United States" in x))\


    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #26
    def test_kafka_stream(self):
        """Test the Python Kafka stream API."""
        topic = "topic1"
        sendData = {"a": 3, "b": 5, "c": 10}
        jSendData = MapConverter().convert(
            sendData, self.ssc.sparkContext._gateway._gateway_client)

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, jSendData)

        stream = KafkaUtils.createStream(self.ssc,
                                         self._kafkaTestUtils.zkAddress(),
                                         "test-streaming-consumer", {topic: 1},
                                         {"auto.offset.reset": "smallest"})

        result = {}
        for i in chain.from_iterable(
                self._collect(stream.map(lambda x: x[1]),
                              sum(sendData.values()))):
            result[i] = result.get(i, 0) + 1

        self.assertEqual(sendData, result)
Example #27
def KafkaWordCount(zkQuorum, group, topics, numThreads):
    spark_conf = SparkConf().setAppName("KafkaWordCount")
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)
    #ssc.checkpoint("file:///usr/local/spark/checkpoint")
    # the line above writes checkpoint files to HDFS, which requires Hadoop to be running
    ssc.checkpoint(".")
    topicAry = topics.split(",")
    # convert the topics into a hashmap; in Python a dict serves as the hashmap
    topicMap = {}
    for topic in topicAry:
        topicMap[topic] = numThreads
    lines = KafkaUtils.createStream(ssc, zkQuorum, group,
                                    topicMap).map(lambda x: x[1])
    words = lines.flatMap(lambda x: x.split(" "))
    wordcount = words.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        (lambda x, y: x + y), (lambda x, y: x - y), 1, 1, 1)
    wordcount.foreachRDD(lambda x: sendmsg(x))
    wordcount.pprint()
    ssc.start()
    ssc.awaitTermination()
Example #28
def main():

    zkQuorum = "localhost:2181"
    topic = "meetup-rsvps-topic"

    sc = SparkContext("local[*]")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, BATCH_DUR) # 5 sec batch duration

    # utf-8 text stream from kafka
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark_consumer", {topic: 1}).cache()

    # recent top N event stream
    event_count = kvs.map(extract_event_count).filter(lambda line: line is not None)
    event_count.reduceByKeyAndWindow(func=lambda x,y:x+y,
                                     invFunc=lambda x,y:x-y,
                                     windowDuration=WIN_DUR,
                                     slideDuration=SLIDE_DUR) \
                .filter(lambda pair: pair[1] > 0) \
                .transform(take_top_rdd) \
                .map(lambda pair: (pair[0][1], pair[1])) \
                .foreachRDD(lambda rdd: rdd.foreachPartition(send_recent_top))

    # running response count stream
    response_count = kvs.map(extract_response).filter(lambda line: line is not None)
    # TODO: may use countByValueAndWindow instead of updateStateByKey
    response_count.updateStateByKey(update_count) \
                  .foreachRDD(lambda rdd: rdd.foreachPartition(send_response_count))

    # count recent rsvps
    rsvp_count = kvs.countByWindow(windowDuration=WIN_DUR, slideDuration=SLIDE_DUR) \
                    .foreachRDD(lambda rdd: rdd.foreachPartition(send_rsvp_count))


    # event_count.pprint()
    ssc.checkpoint("rsvps_checkpoint_dir")
    ssc.start()
    ssc.awaitTermination()
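Regarding the TODO above, a hedged sketch of the windowed alternative it mentions; these lines would replace the updateStateByKey pipeline inside main(), reusing the same names (response_count, WIN_DUR, SLIDE_DUR, send_response_count) assumed there.

    # Hypothetical replacement for the updateStateByKey pipeline above:
    # count each distinct response value over a sliding window instead of
    # keeping a running total (needs ssc.checkpoint, which is already set above).
    response_count.countByValueAndWindow(windowDuration=WIN_DUR, slideDuration=SLIDE_DUR) \
                  .foreachRDD(lambda rdd: rdd.foreachPartition(send_response_count))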
Example #29
def initialize_and_parse_input_stream(input_zookeeper, input_topic, microbatch_duration):
    """
    Initialize spark context, streaming context, input DStream and parse json from DStream.

    :param input_zookeeper: input zookeeper hostname:port
    :param input_topic: input kafka topic
    :param microbatch_duration: duration of micro batches in seconds
    :return sc, ssc, parsed_stream: initialized spark and streaming context, and json with data from DStream
    """
    # Application name used as identifier
    application_name = os.path.basename(sys.argv[0])
    # Spark context initialization
    sc = SparkContext(appName=application_name + ' ' + ' '.join(sys.argv[1:]))  # Application name used as the appName
    ssc = StreamingContext(sc, microbatch_duration)

    # Initialize input DStream of flows from specified Zookeeper server and Kafka topic
    input_stream = KafkaUtils.createStream(ssc, input_zookeeper, 'spark-consumer-' + application_name + str(time.time()),
                                           {input_topic: 1})

    # Parse input stream in the json format
    parsed_stream = input_stream.map(lambda line: json.loads(line[1]))

    return sc, ssc, parsed_stream
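A brief hedged usage sketch for the helper above; the ZooKeeper endpoint, topic, and microbatch duration are placeholders.

# Hypothetical usage of initialize_and_parse_input_stream (placeholder arguments).
sc, ssc, parsed_stream = initialize_and_parse_input_stream("localhost:2181", "flows", 10)
parsed_stream.count().pprint()  # e.g. print how many JSON records arrive per microbatch
ssc.start()
ssc.awaitTermination()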
Example #30
def streaming_profile_toDB(topic, re_table, conn, attributes, rule, job_type):
    sc = spark.sparkContext
    ssc = StreamingContext(sc, 5)
    numThread = 3
    print 'in streaming profile to db'
    print zkQuorum
    print topic
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {topic: numThread})
    if job_type == 'accuracy':
        print 'here in accuracy'
        kvs.foreachRDD(lambda t, rdd: streaming_accuracy(
            t, rdd, topic, re_table, conn, attributes, rule))
    elif job_type == 'profile':
        if rule == 'profile':
            kvs.foreachRDD(lambda t, rdd: streaming_profile(
                t, rdd, topic, re_table, conn, attributes))
        else:
            kvs.foreachRDD(lambda t, rdd: streaming_user_define_profile(
                t, rdd, topic, re_table, conn, attributes, rule))

    ssc.start()
    ssc.awaitTermination()
def main():
    conf = SparkConf().setAppName("pyspark read")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)

    kafkaStream = KafkaUtils.createStream(ssc, "localhost:2181",
                                          "spark-streaming-consumer",
                                          {"splash_json": 2})
    stream = kafkaStream.map(lambda xs: xs)
    stream.foreachRDD(lambda rdd: rdd.foreach(printRdd))

    # stream.foreachRDD(lambda rdd: rdd.saveToMongodb(mongodb_uri))

    # stream.pprint()

    # filter out flights not departing from United States
    # "depAirportCntry": "United States"
    # messages = kafkaStream.map(lambda xs: json.load(xs))
    # jsonmessages = messages.map(lambda x: json.loads(x))
    # usdepartures = jsonmessages.map(lambda x: x['depAirportCntry'].filter(lambda x: "United States" in x))\

    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #32
def streaming_data(topic_name, window_size):
    '''
    Get data stream from Kafka broker
    '''

    # Spark context
    sc = SparkContext(appName="PythonSparkStreamingKafka")
    # sc.setLogLevel("WARN")

    # Streaming context
    batch_duration = 60  # batch duration in seconds
    stc = StreamingContext(sc, batch_duration)

    # Connect to Kafka and get DStream of input stream data
    kafkaStream = KafkaUtils.createStream(stc, 'localhost:2181',
                                          'raw-event-streaming-consumer',
                                          {topic_name: 1})

    # windowed stream
    windowedStream = kafkaStream.window(window_size)
    # Start the streaming context
    stc.start()
    stc.awaitTermination()
def start():
    sconf = SparkConf()
    # sconf.set('spark.streaming.blockInterval','100')
    sconf.set('spark.cores.max', 8)
    sc = SparkContext(appName='KafkaWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    numStreams = 3
    kafkaStreams = [
        KafkaUtils.createStream(
            ssc,
            "server1-2-5-24-138:2181,server1-3-5-24-139:2181,server1-4-5-24-140:2181",
            "streaming_test_group", {"spark_streaming_test_topic": 1})
        for _ in range(numStreams)
    ]
    unifiedStream = ssc.union(*kafkaStreams)
    print unifiedStream
    # count the distribution of the generated random numbers
    result = unifiedStream.map(lambda x: (x[0], 1)).reduceByKey(
        lambda x, y: x + y)
    result.pprint()
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #34
def Stream(KafkaTopic, ssc):
    #global Num
    print(KafkaTopic)
    StrTopic = KafkaTopic.split('-')
    Num = StrTopic[0]
    #print(Num)
    kvs1 = KafkaUtils.createStream(
        ssc, "localhost:2181", KafkaTopic,
        {KafkaTopic: 1})  # receive data from Kafka; "localhost:2181" is the ZooKeeper address
    #kvs2 = KafkaUtils.createStream(ssc, "localhost:2181", KafkaTopic[1], {KafkaTopic[1]: 1})  # receive data from Kafka; "localhost:2181" is the ZooKeeper address
    lines1 = kvs1.map(lambda x: eval(x[1])['battery'])  # x[1] is the Kafka message value; nothing else works here
    #lines1.pprint()
    #lines2 = kvs2.map(lambda x: eval(x[1])['battery'])  # x[1] is the Kafka message value; nothing else works here
    #val = lines.map(lambda k: (k['battery']))  # pull the needed fields from the dict into a tuple
    batterychange1 = lines1.window(6, 3)  # create the computation window
    #batterychange2 = lines2.window(6,6)
    batterymax1 = batterychange1.reduce(lambda x, y: x + y)
    batteryfinal = batterymax1.map(lambda x: {"Exo_ID": Num, "battery": x})
    #topic = kvs1.map(lambda x: x[0])
    #batterymax2 = batterychange2.reduce(lambda x,y : x+y)
    batteryfinal.pprint()
    #topic.pprint()
    #batterymax2.pprint()
    batteryfinal.foreachRDD(lambda rdd: rdd.foreach(sendMsg))  # create a new RDD used to send the data
def main():

    try:
        SetParameters()
        sc = CreateSparkContext()
        ssc = StreamingContext(sc, 5)

        kafka_stream = KafkaUtils.createStream(ssc, "192.168.0.1:2181",
                                               "topic", {topic: 12})

        raw = kafka_stream.flatMap(lambda kafkaS: [kafkaS])

        lines = raw.flatMap(lambda xs: xs[1].split(","))

        counts = lines.map(lambda word: (str(datetime.now()), "api", word))

        counts.foreachRDD(lambda k: saveToCassandra(k, sc, ssc))

        ssc.start()

        ssc.awaitTermination()

    except Exception, e:
        print('error:' + str(e))
def getStreamingData():
    sc = SparkContext(appName="spark_temperature_processor")
    ssc = StreamingContext(sc, 1)
    zkQuorum = 'localhost:2181'
    topic = 'test'

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {topic: 1})

    lines = kvs.map(lambda x: x[1])
    lines.pprint()
    fo.write(str(lines))

    #counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)
    #counts.pprint()

    #.reduceByKey(lambda a,b : "hot" if int(b) > 90 else "cold")
    #maxt.saveAsTextFile("kk.txt")

    maxt = lines.map(lambda x: x[0])
    maxt.pprint()

    ssc.start()
    ssc.awaitTermination()  #terminate in 5 seconds
Example #37
def createContext():
    sc = SparkContext(appName="PythonSparkStreamingKafka_RM_02")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 5)
    
    # Define Kafka Consumer
    kafkaStream = KafkaUtils.createStream(ssc, 'cdh57-01-node-01.moffatt.me:2181', 'spark-streaming2', {'twitter':1})
    
    ## --- Processing
    # Extract tweets
    parsed = kafkaStream.map(lambda v: json.loads(v[1]))
    
    # Count number of tweets in the batch
    count_this_batch = kafkaStream.count().map(lambda x:('Tweets this batch: %s' % x))
    
    # Count by windowed time period
    count_windowed = kafkaStream.countByWindow(60,5).map(lambda x:('Tweets total (One minute rolling count): %s' % x))

    # Get authors
    authors_dstream = parsed.map(lambda tweet: tweet['user']['screen_name'])
    
    # Count each value and number of occurences 
    count_values_this_batch = authors_dstream.countByValue() \
        .transform(lambda rdd: rdd.sortBy(lambda x: -x[1])) \
        .map(lambda x: "Author counts this batch:\tValue %s\tCount %s" % (x[0], x[1]))

    # Count each value and number of occurences in the batch windowed
    count_values_windowed = authors_dstream.countByValueAndWindow(60, 5) \
        .transform(lambda rdd: rdd.sortBy(lambda x: -x[1])) \
        .map(lambda x: "Author counts (One minute rolling):\tValue %s\tCount %s" % (x[0], x[1]))

    # Write total tweet counts to stdout
    # Done with a union here instead of two separate pprint statements just to make it cleaner to display
    count_this_batch.union(count_windowed).pprint()

    # Write tweet author counts to stdout
    count_values_this_batch.pprint(5)
    count_values_windowed.pprint(5)
    
    return ssc
Example #38
def create_dstream(ssc, zk_quorum, group_id, topics):
    '''
        Create an input stream that pulls ids packet messages from Kafka.

    :param ssc      : :class:`pyspark.streaming.context.StreamingContext` object.
    :param zk_quorum: Zookeeper quorum (host[:port],...).
    :param group_id : The group id for this consumer.
    :param topics   : Dictionary of topic -> numOfPartitions to consume. Each
                      partition is consumed in its own thread.
    :returns        : DStream of parsed ids packet messages, one list of fields per message.
    :rtype          : :class:`pyspark.streaming.dstream.DStream`
    '''
    from pyspark.streaming.kafka import KafkaUtils
    from ..serializer import deserialize

    dstream = KafkaUtils.createStream(ssc,
                                      zk_quorum,
                                      group_id,
                                      topics,
                                      keyDecoder=lambda x: x,
                                      valueDecoder=deserialize)

    return dstream.map(lambda x: x[1]).flatMap(lambda x: x).map(
        lambda x: x.split(','))
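A hedged usage sketch for create_dstream above; the app name, ZooKeeper quorum, group id and topic map are placeholders, and the relative import of deserialize is assumed to resolve inside its package.

# Hypothetical usage (placeholder quorum, group id and topics).
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="ids-ingest")
ssc = StreamingContext(sc, 5)
packets = create_dstream(ssc, "zk1:2181,zk2:2181", "ids-consumer", {"ids-packets": 2})
packets.pprint()
ssc.start()
ssc.awaitTermination()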
Example #39
def main(tag):

    sc = create_sc("UnitTest")
    sc.setLogLevel("INFO")

    print('SPARK CONTEXT INFO  :')
    print('      VERSION       :', sc.version)
    print('      DRIVER MEMORY :', sc._conf.get('spark.driver.memory'))

    stream = StreamingContext(sc, 60)

    kafka_stream = KafkaUtils.createStream(
        stream, 'localhost:2181', 'spark-historian-consumer',
        {'historian-topic-CRY-TGBT-NORMAL-CRY-act-cons-pow': 1})
    #kafka_stream = KafkaUtils.createStream(stream, 'victoria.com:2181', 'spark-streaming', {'imagetext':1})
    #parsed = kafka_stream.map(lambda v: json.loads(v[1]))
    parsed = kafka_stream.map(lambda v: analyzeLog(v[1]))
    #parsed.pprint()
    parsed.foreachRDD(lambda k: process(k))

    parsed.pprint()

    stream.start()
    stream.awaitTermination()
def main():

    global zookeeper_IP
    global cassandra_IP
    global cassandra_keyspace
    global cassandra_table
    global kafka_handling_api
    global seconds_per_job
    global topic
    global c_dao

    zookeeper_IP = GetConfig('zookeeper_IP')
    cassandra_IP = GetConfig('cassandra_IP')
    cassandra_keyspace = GetConfig('cassandra_keyspace')
    cassandra_table = GetConfig('cassandra_table')
    kafka_handling_api = GetConfig('kafka_handling_api')
    seconds_per_job = GetConfig('seconds_per_job')
    topic = GetConfig('topic')
    c_dao = CassandraDAO(CassandraType.PRODUCTION)

    sc = CreateSparkContext(cassandra_IP)
    ssc = StreamingContext(sc, int(float(seconds_per_job)))

    try:

        kafka_stream = KafkaUtils.createStream(ssc, zookeeper_IP,
                                               'spark-streaming-consumer',
                                               {topic: 12})
        raw = kafka_stream.flatMap(lambda kafkaS: [kafkaS])
        lines = raw.filter(lambda xs: xs[1].split(','))

        counts = lines.map(lambda word: (str(datetime.now()), 'api', word[1]))
        counts.foreachRDD(lambda k: saveToCassandra(sc, ssc, k))

    except Exception, e:
        print('error:' + str(e))
Example #41
def streaming():
    os.environ[
        'PYSPARK_SUBMIT_ARGS'] = '--jars spark-streaming-kafka-assembly_2.10-1.6.0.jar pyspark-shell'
    spark = SparkSession.builder.master("spark://t3.dev:7077").appName("test") \
        .config('spark.jars.packages', 'org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.4.5') \
        .getOrCreate()

    print(dir(spark._jvm))

    ssc = StreamingContext(spark.sparkContext, 5)

    kafkaParams = {
        "bootstrap_servers": "t3.dev:9092",
        "kafka.bootstrap.servers": "t3.dev:9092",
        "brokers": "t3.dev:9092",
        "host": "t3.dev:9092"
    }
    topics = {'spark-test': 1}
    lines = KafkaUtils.createStream(ssc, 't3.dev:2181', 'local-test', topics,
                                    kafkaParams)
    print(lines.pprint(10))

    ssc.start()
    ssc.awaitTermination(60)
Example #42
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext, SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

conf = SparkConf().setMaster("local[1]").setAppName("StreamProcessor_1")

sc = SparkContext(conf=conf)

print "Setting LOG LEVEL as ERROR"
sc.setLogLevel("ERROR")

ssc = StreamingContext(sparkContext=sc, batchDuration=1)

kafkaStream = KafkaUtils.createStream(ssc=ssc,
                                      zkQuorum='localhost:2181',
                                      groupId='spark-streaming-consumer',  # groupId is required; value assumed
                                      topics={'test': 1})

print(sc)
Example #43
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    # Create the Spark context
    sc = SparkContext(appName="DataIngestionApp")
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)
    # Create the Spark Streaming Context with 10 seconds batch interval
    ssc = StreamingContext(sc, 10)
    # Check point directory setting
    ssc.checkpoint("/tmp")
    # Zookeeper host
    zooKeeperQuorum="localhost"
    # Kaka message group
    messageGroup="sfb-consumer-group"
    # Kafka topic where the programming is listening for the data
    # Reader TODO: Here only one topic is included, it can take a comma separated string containing the list of topics.
    # Reader TODO: When using multiple topics, use your own logic to extract the right message and persist to its data store
    topics = "message"
    numThreads = 1    
    # Create a Kafka DStream
    kafkaStream = KafkaUtils.createStream(ssc, zooKeeperQuorum, messageGroup, {topics: numThreads})
    messageLines = kafkaStream.map(lambda x: x[1])
    # This is where the messages are printed to the console. Instead of this, implement your own persistence logic
    messageLines.pprint()
    # Start the streaming
    ssc.start()
    # Wait till the application is terminated	
    ssc.awaitTermination()
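Regarding the Reader TODOs above, a small hedged sketch of how a comma separated topic string could be expanded into the topic map that createStream expects; the topic list is a placeholder and the splitting logic is not part of the original.

# Hypothetical handling of multiple comma separated topics for the example above.
topics = "message,alerts,audit"  # placeholder comma separated list
numThreads = 1
topicMap = {t.strip(): numThreads for t in topics.split(",")}
kafkaStream = KafkaUtils.createStream(ssc, zooKeeperQuorum, messageGroup, topicMap)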
Example #44
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['topic1'], kafkaParams = {"metadata.broker.list":"localhost:9092"})

    tweets = tweets.map(lambda x: x[1].encode("ascii","ignore"))
    return tweets

def process_rdd_queue(twitter_stream):
        # Create the queue through which RDDs can be pushed to
        # a QueueInputDStream
    rddQueue = [] 
    for i in range(3):
        rddQueue += [ssc.sparkContext.parallelize([get_next_tweet(twitter_stream)], 5)]
    lines = ssc.queueStream(rddQueue)
    lines.pprint()

if __name__ == "__main__":
    sc = SparkContext(appName="PythonStreamingQueueStream")
    ssc = StreamingContext(sc, 10)
    # Instantiate the twitter_stream
    #twitter_stream = connect_twitter()
    # Get RDD queue of the streams json or parsed
    #process_rdd_queue(twitter_stream)
    zkQuorum = "localhost:2181"
    topic = "topic1"
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "PythonStreamingQueueStream", {topic: 1})
    #tweets = stream(ssc)
    #process_rdd_queue(twitter_stream)
    tweets.pprint()	
    ssc.start()
    time.sleep(100)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example #45
    rdd.foreachPartition(output_partition)


if __name__ == "__main__":

    client = pyhdfs.HdfsClient(hosts="10.120.14.120:9000",
                               user_name="cloudera")

    # set up producer for topic "utime"
    topic = "utime"
    broker_list = '10.120.14.120:9092,10.120.14.120:9093'

    sc = SparkContext()
    ssc = StreamingContext(sc, 3)
    # set up consumer kafka stream reading from topic "Pdata"
    lines = KafkaUtils.createStream(ssc, "10.120.14.120:2182",
                                    "Pdata_for_model", {"Pdata": 3})

    load_file = open(
        "/home/cloudera/HA_ML_prdict_project/predict_model/rfr_0910_df.pkl",
        'rb')
    MRI_Model = joblib.load(load_file)
    load_file.close()
    rfr_bc = sc.broadcast(MRI_Model)

    r = lines.map(lambda x: x[0])
    r0 = lines.map(lambda x: x[1])
    r1 = r0.map(lambda x: (int(x.split(",")[0]),int(x.split(",")[1]),int(x.split(",")[2]),int(x.split(",")[3]),int(x.split(",")[4]),\
                   int(x.split(",")[5]),int(x.split(",")[6]),int(x.split(",")[7])))
    r2 = r1.map(lambda x: np.array(x, dtype=int))
    r3 = r2.map(lambda x: x.reshape(1, -1))
    RDB_HOST =  os.environ.get('RDB_HOST')
    RDB_PORT = os.environ.get('RDB_PORT')
    RDB_DB = "avrotopic1db"
    zkQuorum, topic, stream_window, RDB_TABLE = sys.argv[1:]
    stream_window = int(stream_window)
    
    sc = SparkContext(appName="PythonStreamingKafkaSums")
    ssc = StreamingContext(sc, batchDuration=stream_window)

    
    streams = []
    schema = avro.schema.parse(open("WaterSensor.avsc").read())
    reader = DatumReader(schema)
    numStreams = 4

    kafkaStreams = [KafkaUtils.createStream(ssc=ssc, zkQuorum=zkQuorum, groupId="avro-topic1-consumer", valueDecoder=io.BytesIO, topics={topic: 1}) for _ in range (numStreams)]
    
    #kvs = kafkaStreams[1]
    #kkvvss = ssc.union(*kafkaStreams)#.partitionBy(numPartitions=20)
    #kvs.print()


    #kvs = KafkaUtils.createStream(ssc, zkQuorum, "my-topic2-consumer", {topic: 1})
    def sendRDDCount(count):
        #print('index: ' + str(index))
        connection = createNewConnection()#todo: use-connection-pool
        #print('count' + str(count))
        #r.table(RDB_TABLE).filter(r.row["partition"] == index).update({"count": count}).run(connection)
        r.table(RDB_TABLE).insert({"count": count, "time":time.time()}).run(connection)
        connection.close()
    def sendPartitionCount(index, count):
Example #47
		try:
			s = db.session.query(Station).filter(Station.id == station['station_id'])

			s.update({Station.num_bikes_available: station['num_bikes_available'],
									Station.num_docks_available: station['num_docks_available']})
			db.session.commit()
		except exc.IntegrityError:
			db.session.rollback()


def consume_data():
	sc = SparkContext(appName="Lets Go")
	ssc = StreamingContext(sc, 1)

	zkQuorum, topic = sys.argv[1:]
	kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
	lines = kvs.map(lambda x: x[1])
	counts = lines.flatMap(lambda line: line.split(" ")) \
		.map(lambda row: (row, update_station_status())) \
		.reduceByKey(lambda a, b: a+b)

	ssc.start()
	ssc.awaitTermination()


def system_alerts():
	"""Get alerts about the system. 

		https://gbfs.citibikenyc.com/gbfs/en/system_alerts.json"""
	pass
		"mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
	keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
	valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"

	#execute send
	rdd.map(writeHbase).saveAsNewAPIHadoopDataset(
			conf=conf,
			keyConverter=keyConv,
			valueConverter=valueConv)	 
	
if __name__ == "__main__":

	sc = SparkContext(appName = "Hbase")
	ssc = StreamingContext(sc, 1)
	# ssc.checkpoint(checkpointDir)
	# ssc = StreamingContext.getOrCreate(checkpointDir, buildContext)

	zkQuorum = "localhost:2181"
	kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {KAFKA_TOPIC: 1})
	#data stream of data dictionaries
	
	ds = kvs.map(lambda data: ast.literal_eval(data[1]))
	ds.pprint()
	if ds is not None:
		ds.foreachRDD(sendRecord)
	
	ssc.start()
	ssc.awaitTermination()
	
	# sumstats = ds.map(partitionCount).updateStateByKey(partitionCount)
	# ssc.stop(stopGraceFully=True) 
Example #49
                                schema=['MinPrice', 'Direct', 'OutboundLeg'])
        df.show()
        df.write.saveAsTable(name="default.flights",
                             format="hive",
                             mode="append")


sc = SparkContext("local[*]", "FlightData")
ssc = StreamingContext(sc, 5)

ss = SparkSession.builder.appName("FlightData").config(
    "spark.sql.warehouse.dir", "/user/hive/warehouse").config(
        "hive.metastore.uris",
        "thrift://localhost:9083").enableHiveSupport().getOrCreate()

kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'Flights',
                                      {'flights': 1})

parsed = kafkaStream.map(lambda v: json.loads(v[1]))

#user_counts = parsed.map(lambda tweet: (tweet['user']["screen_name"], 1)).reduceByKey(lambda x,y: x + y)

#user_counts.pprint()

longest_duration = parsed.flatMap(lambda v: v.get("Quotes"))
#longest_duration.pprint()

table = longest_duration.map(
    lambda v: (v.get("MinPrice"), v.get("Direct"), v.get("OutboundLeg")))

longest_duration.pprint()
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="kafkaTest")
ssc = StreamingContext(sc,5)


kvs = KafkaUtils.createStream(ssc, "localhost:2181", "spark_streaming", {"inter_transact": 1})
kvs.pprint(10)



ssc.start()

ssc.awaitTermination()
'''
Created on Jul 7, 2016

@author: rbhat
'''
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


# Create a local StreamingContext with two working thread and batch interval of 2 second
sc = SparkContext("local[*]", "MyKafkaStream")
ssc = StreamingContext(sc, 1)

kafkaStream = KafkaUtils.createStream(ssc, "deepc04.acis.ufl.edu:2181", "GroupNameDoesntMatter", {"test": 2})

messages = kafkaStream.map(lambda xs:xs)
messages.pprint()

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
    stream_window = int(stream_window)
    
    sc = SparkContext(appName="PythonStreamingKafkaJSONSums")
    ssc = StreamingContext(sc, batchDuration=stream_window)
    def createNewConnection():
        return r.connect(host=RDB_HOST, port=RDB_PORT, db=RDB_DB)
    
    #delete any data in table
    connection = createNewConnection()
    r.table(RDB_TABLE).delete().run(connection)
    connection.close()
    
    streams = []
    
    numStreams = 6 #read parallelism
    kafkaStreams = [KafkaUtils.createStream(ssc, zkQuorum, "JSON-consumer", {topic: 1}) for _ in range (numStreams)]
    #set up kafkaStreams into a list
    def sendRDDCount(count):
        connection = createNewConnection()
        r.table(RDB_TABLE).insert(count).run(connection)
        connection.close()
    for idx,kvs in enumerate(kafkaStreams):
        countsDstream=kvs.count()
        countsDstream = countsDstream.map(lambda x: {"count":x, "time":time.time()})
        records = kvs.map(lambda x: bytesDecoder(x[1]))
        sums = records.map(lambda obj: (obj['unique_id'], obj['quantity'])) \
            .reduceByKey(lambda a, b: a+b)
        countsDstream.foreachRDD(lambda rdd: sendRDDCount(rdd.take(1)))

    ssc.start()
    ssc.awaitTermination()
Example #53
# Section 6.2.4, Example 6-12
from pyspark import SparkContext, SparkConf, storagelevel
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## When running inside the pyspark shell, do not create the SparkContext yourself!
# ./pyspark --packages org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.2
conf = SparkConf()
sc = SparkContext(master="local[*]", appName="KafkaSample", conf=conf)
ssc = StreamingContext(sc, 3)

ds1 = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group1", {"test": 3})
ds2 = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

ds1.pprint()
ds2.pprint()

ssc.start()
ssc.awaitTermination()
Example #54
                                    record_id = colleciton.insert(push_user)

                                    colleciton = mongodb.notify_record
                                    notify_user = {"createtime": int(time.time()), "sendid": sendid, "type": channel,"purchaseinfoid":purchaseinfoid,"recordid":record_id}
                                    colleciton.insert(notify_user)
                                    #print sendid, str(num),purchaseinfo["name"].encode("utf8"),purchaseinfo["price"].encode("utf8"),purchaseinfo["unit"].encode("utf8"),str(purchaseinfo["purchaseid"]),sendtype
                                    #reply_wx_notify(sendid, str(num), purchaseinfo["name"],purchaseinfo["price"], purchaseinfo["unit"], str(purchaseinfoid),str(purchaseinfo["purchaseid"]))
                                    thread.start_new_thread(reply_wx_notify, (sendid, str(num), purchaseinfo["name"],purchaseinfo["price"], purchaseinfo["unit"], str(purchaseinfoid),str(purchaseinfo["purchaseid"]),uuid,sendtype))
                                    pass




def handlestream(kvs):
    parsed = kvs.map(lambda (k, v): json.loads(v))  # parse the message into JSON
    # handle send tasks
    send=parsed.filter(lambda x: True if x["messagetype"] == 2  else False)
    send.foreachRDD(sendPush)





if __name__ == "__main__":
    sc = SparkContext(appName="sendKafka")
    ssc = StreamingContext(sc, 1)
    kvs = KafkaUtils.createStream(ssc, zk_server, "send-group", {send_task_topic: 1})
    handlestream(kvs)
    ssc.start()
    ssc.awaitTermination()
Example #55
                                    colleciton.insert(notify_user)
                                    #print sendid, str(num),purchaseinfo["name"].encode("utf8"),purchaseinfo["price"].encode("utf8"),purchaseinfo["unit"].encode("utf8"),str(purchaseinfo["purchaseid"]),sendtype
                                    #reply_wx_notify(sendid, str(num), purchaseinfo["name"],purchaseinfo["price"], purchaseinfo["unit"], str(purchaseinfoid),str(purchaseinfo["purchaseid"]))
                                    thread.start_new_thread(
                                        reply_wx_notify,
                                        (sendid, str(num),
                                         purchaseinfo["name"],
                                         purchaseinfo["price"],
                                         purchaseinfo["unit"],
                                         str(purchaseinfoid),
                                         str(purchaseinfo["purchaseid"]), uuid,
                                         sendtype))
                                    pass


def handlestream(kvs):
    parsed = kvs.map(lambda (k, v): json.loads(v))  # parse the message into JSON
    # handle send tasks
    send = parsed.filter(lambda x: True if x["messagetype"] == 2 else False)
    send.foreachRDD(sendPush)


if __name__ == "__main__":
    sc = SparkContext(appName="sendKafka")
    ssc = StreamingContext(sc, 1)
    kvs = KafkaUtils.createStream(ssc, zk_server, "send-group",
                                  {send_task_topic: 1})
    handlestream(kvs)
    ssc.start()
    ssc.awaitTermination()
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: script.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    zkQuorum, topic = sys.argv[1:]

    sc = SparkContext(appName="KafkaSparkStreaming")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    ks = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 42})

    def processInput(line):
        fields = line[1].split("\t")
        return ((str(fields[6]), 1), (str(fields[7]), 1))

    def updateFunction(newValues, runningCount):
        return sum(newValues, runningCount or 0)

    digest = ks.flatMap(processInput)\
               .updateStateByKey(updateFunction)\
               .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False)\
                                         .map(lambda (x, y): y).zipWithIndex().map(lambda (x, y): (y, x))
               )

    def toCSVLine(data):
Example #57
    return cv2.imdecode(np.frombuffer(s, dtype=np.uint8), -1)
    # the dtype of the input data (.jpg) is np.uint8
   
def write_img(rdd):
    imgs = rdd.collect()
    for img in imgs:
        cv2.imwrite('res1.jpg', img)
        
if __name__ == '__main__':
    player_dict = getDict()
    
    # spark = SparkSession.builder.getOrCreate()
    sc = SparkContext()
    ssc = StreamingContext(sc, 1)

    raw_stream = KafkaUtils.createStream(ssc, 'localhost:2182', 'dl', {'dl_input':3}, valueDecoder=imdecoder)
    # Kafka default valueDecoder is str.decode('utf-8')
    imgs = raw_stream.map(lambda x:x[1])

    # load model
    model = get_testing_model()
    keras_weights_file = 'model/keras/model.h5'
    model.load_weights(keras_weights_file)

    # load config
    params, model_params = config_reader()
    
    # process as rdd
    info_list,temp = imgs.transform(body_and_num_recog)
    temp.foreachRDD(write_img)
    # num = temp.map(lambda x:int(x[1]))
Example #58

if __name__ == "__main__":
    # SparkContext represents connection to a Spark cluster.
    conf = SparkConf()
    conf.setAppName("Kafka Spark App")
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # StreamingContext represents connection to a Spark cluster from existing SparkContext.
    ssc = StreamingContext(
        sc, 60)  # the number indicates how many seconds each batch lasts.

    # Creates an input stream that pulls events from Kafka.
    kvs = KafkaUtils.createStream(ssc, "streamsetApp:2181",
                                  "spark-streaming-consumer", {"NETFLOW": 1})
    parsed = kvs.map(lambda x: json.loads(x[1]))

    # Get only elements that are needed and rename to make it clear.
    netflow_dict = parsed.map(lambda x: ({
        'srcAddr': x['srcaddr_s'],
        'srcPort': x['srcport'],
        'dstAddr': x['dstaddr_s'],
        'dstPort': x['dstport'],
        'tcpFlags': x['tcp_flags'],
        'protocol': x['proto'],
        'timestampStart': x['first'],
        'timestampEnd': x['last'],
        'numBytes': x['dOctets'],
        'numFlows': x['count']
    }))
def my_decoder(s):
    return s


def eye_aspect_ratio(eye):
    A = distance.euclidean(eye[1], eye[5])
    B = distance.euclidean(eye[2], eye[4])
    C = distance.euclidean(eye[0], eye[3])
    ear = (A + B) / (2.0 * C)
    return ear


kafkaStream = KafkaUtils.createStream(ssc,
                                      brokers,
                                      'test-consumer-group-1',
                                      {input_topic: 15},
                                      valueDecoder=my_decoder)
producer = KafkaProducer(bootstrap_servers='G01-01:9092',
                         compression_type='gzip',
                         batch_size=163840,
                         buffer_memory=33554432,
                         max_request_size=20485760)
thresh = 0.25
frame_check = 20
detect = dlib.get_frontal_face_detector()
predict = dlib.shape_predictor(
    predictor_path)  # Dat file is the crux of the code

(lStart, lEnd) = face_utils.FACIAL_LANDMARKS_68_IDXS["left_eye"]
(rStart, rEnd) = face_utils.FACIAL_LANDMARKS_68_IDXS["right_eye"]
Example #60
    virtualMachine = 'local'
    if socket.gethostname() == 'ubuntu':
        virtualMachine = socket.gethostname()

    if virtualMachine == 'local':
        dirTrainingModel = config.get('StreamingProperties', 'URLTrainingModelLocal')

    else:
        dirTrainingModel = config.get('StreamingProperties', 'URLTrainingModelHDFS')

    if virtualMachine == 'ubuntu':
        ssc = StreamingContext(sc, 2)
        brokers = "localhost:2181"

        kvs = KafkaUtils.createStream(ssc, \
                                       "localhost:2181", \
                                       topicName,
                                        {"topic":1})


        #kvs = KafkaUtils.createDirectStream (ssc, [topicName], {"metadata.broker.list": brokers})
        kvs.pprint()
        #kvs.foreachRDD (saveData)
        #brokers = "localhost:9092"
        #kvs = KafkaUtils.createDirectStream (ssc, [topicName], {"metadata.broker.list": brokers})
        #KafkaUtils.createStream(scc, )
        #kvs.pprint()
        #kvs.foreachRDD (saveStream)

        #rowData = data.map(lambda row: row.asDict())
        #rowData.saveToMongoDB(mongodb_connection + 'test.resultsStreaming')
        ssc.start()