Example No. 1
def main():
    # Connect to the standalone Spark master (the batch interval is set below)
    sc = SparkContext("spark://ip-172-31-29-29:7077", "MyKafkaStream")

    # stream interval of 5 seconds
    ssc = StreamingContext(sc, 5)
    kafkaStream = KafkaUtils.createStream(ssc, "52.3.61.194:2181", "GroupNameDoesntMatter", {"parking_sensor_data": 2})
    messages = kafkaStream.flatMap(lambda s: create_tuple(s[1])).reduceByKey(lambda a,b: (int(a)+int(b))/2)
    messages1 = messages.filter(lambda s: s[1] > 0)
    messages1.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
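create_tuple is defined elsewhere in the source. A minimal hypothetical sketch, assuming each Kafka message value is a comma-separated "sensor_id,occupancy" pair (it returns a list because it feeds flatMap):

def create_tuple(msg):
    # hypothetical parser: "sensor_id,occupancy" -> [(sensor_id, occupancy)]
    sensor_id, occupancy = msg.split(",")[:2]
    return [(sensor_id, occupancy)]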
Example No. 2
def bro_parse(zk,topic,db,db_table,num_of_workers):
    
    app_name = "ONI-INGEST-{0}".format(topic)
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc,1)
    sqc = HiveContext(sc)

    # create DStream for each topic partition.
    topic_dstreams = [ KafkaUtils.createStream(ssc, zk, app_name, {topic: 1}, keyDecoder=oni_decoder, valueDecoder=oni_decoder) for _ in range (wrks)  ] 
    tp_stream = ssc.union(*topic_dstreams)

    # Parallelism in Data Processing
    #processingDStream = tp_stream(wrks)

    # parse the RDD content.
    proxy_logs = tp_stream.map(lambda x: proxy_parser(x[1]))

    # save RDD into hive .
    proxy_logs.foreachRDD(lambda x: save_to_hive(x,sqc,db,db_table,topic))

    ssc.start()
    ssc.awaitTermination()
Example No. 3
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time

	WARNING!! This function only works for spark 1.4.0+ 

	Args:
		brokers: the kafka broker that we look at for the topic
		topic: the kafka topic for input
		bucket_interval: the time interval in seconds (int) that the job will
			bucket

	Returns:
		None
		
	"""
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers}, valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)

    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
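The message_parse and output_msg callables are supplied by the caller and are not shown. A hypothetical pair, assuming each message value starts with an epoch timestamp, might look like:

def message_parse(line):
    # hypothetical: bucket messages by the minute of the epoch timestamp
    # carried in the first comma-separated field
    return int(line.split(",")[0]) // 60 * 60

def output_msg(sc, ssc):
    # hypothetical factory: returns the function applied to each counted RDD
    def emit(rdd):
        for bucket, count in rdd.collect():
            print("%d -> %d" % (bucket, count))
    return emit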
Example No. 4
def main():
    sym_dict = {}
    conf = SparkConf().setAppName("symbol stream")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, .1)

    lines = ssc.socketTextStream("localhost", 1337)
    
    def print_now():
        print(sym_dict)

    def predict(prices):
        print(prices)

    def add_to_dict(line):
        symbol, price, volume = line.split(',')
        if symbol in sym_dict:
            print('made it here')
            sym_dict[symbol][0].append(price)
            sym_dict[symbol][1].append(volume)
            if len(sym_dict[symbol][0]) > 10:
                sym_dict[symbol][0].pop(0)
                sym_dict[symbol][1].pop(0)
                predict(sym_dict[symbol][0])
        else:
            sym_dict[symbol] = [[price],[volume]]
    
    
    #test = lines.map(lambda line: json.dumps(line)) 
    test = lines.map(lambda line: line)
    test.pprint()
    ssc.start()
    ssc.awaitTermination()
Example No. 5
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                        '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
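process_generic is defined elsewhere in the project. A hypothetical sketch that persists each batch to MongoDB and then signals the REST endpoint (the database and collection names here are assumptions):

import requests
from pymongo import MongoClient

def process_generic(rdd, mongo_url, rest_url):
    # hypothetical handler: store the batch, then notify the REST server
    records = rdd.collect()
    if not records:
        return
    client = MongoClient(mongo_url)
    client.sparkhara.log_messages.insert_many([{'message': r} for r in records])
    requests.post(rest_url, json={'count': len(records)})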
Example No. 6
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)

    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)
    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
Example No. 7
def main():
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <zk> <topic> <timeout>",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)
    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout
    ssc.start()
    ssc.awaitTermination(**kwargs)
Example No. 8
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message value

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # Tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example No. 9
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message value

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # Tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
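storeOffsetRanges and printOffsetRanges are not shown; they presumably follow the standard offset-tracking pattern from the Spark Streaming + Kafka integration guide, roughly:

offsetRanges = []

def storeOffsetRanges(rdd):
    # remember this batch's Kafka offset ranges
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))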
Example No. 10
def main():
    sc = SparkContext(appName="IntrusionDetector")
    ssc = StreamingContext(sc, batch_durations)

    kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker})
    kvs.foreachRDD(processRDD)
    ssc.start()
    ssc.awaitTermination()
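broker, input_topic, batch_durations and processRDD are module-level definitions that the snippet does not include. A minimal hypothetical processRDD that just parses each batch might be:

import json

def processRDD(rdd):
    # hypothetical handler: parse Kafka message values as JSON and report the batch size
    events = rdd.map(lambda kv: json.loads(kv[1])).collect()
    print("received %d events" % len(events))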
Example No. 11
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    # ssc = StreamingContext(sc, interval_seconds)
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
Example No. 12
def read_tweets():

    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # Test 60 seconds
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
Example No. 13
def main():
    conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    try:
        kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2})
        kafka_streams.foreachRDD(process_rdd)
    except Exception as e:
        print(e)
    ssc.start()
    ssc.awaitTermination()
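process_rdd is not shown. Going by the app name, a hypothetical version that filters the JSON messages and writes them to MongoDB with pymongo (database, collection and filter field are assumptions) could be:

import json
from pymongo import MongoClient

def process_rdd(rdd):
    # hypothetical sink: keep only records marked valid and insert them into MongoDB
    docs = rdd.map(lambda kv: json.loads(kv[1])) \
              .filter(lambda doc: doc.get('valid', True)) \
              .collect()
    if docs:
        MongoClient('mongodb://localhost:27017')['splash']['events'].insert_many(docs)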
Example No. 14
def start():
    sc=SparkContext(appName='HdfsWordCount')
    ssc=StreamingContext(sc,5)
    # Only files created after the streaming job starts are counted
    lines = ssc.textFileStream('/user/hive/warehouse/streaming_status/2016091101')
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)
    wordCounts.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example No. 15
def invoke():
    # object to keep track of offsets
    ConfigInitializer.basic_config()

    # app name
    application_name = "mon_metrics_kafka"

    my_spark_conf = SparkConf().setAppName(application_name)

    spark_context = SparkContext(conf=my_spark_conf)

    # read at the configured interval
    spark_streaming_context = \
        StreamingContext(spark_context, cfg.CONF.service.stream_interval)

    kafka_stream = MonMetricsKafkaProcessor.get_kafka_stream(
        cfg.CONF.messaging.topic,
        spark_streaming_context)

    # transform to recordstore
    MonMetricsKafkaProcessor.transform_to_recordstore(kafka_stream)

    # catch interrupt, stop streaming context gracefully
    # signal.signal(signal.SIGINT, signal_handler)

    # start processing
    spark_streaming_context.start()

    # FIXME: stop spark context to relinquish resources

    # FIXME: specify cores, so as not to use all the resources on the cluster.

    # FIXME: HA deploy multiple masters, may be one on each control node

    try:
        # Wait for the Spark driver to "finish"
        spark_streaming_context.awaitTermination()
    except Exception as e:
        MonMetricsKafkaProcessor.log_debug(
            "Exception raised during Spark execution : " + str(e))
        # One exception that can occur here is the result of the saved
        # kafka offsets being obsolete/out of range.  Delete the saved
        # offsets to improve the chance of success on the next execution.

        # TODO(someone) prevent deleting all offsets for an application,
        # but just the latest revision
        MonMetricsKafkaProcessor.log_debug(
            "Deleting saved offsets for chance of success on next execution")

        MonMetricsKafkaProcessor.reset_kafka_offsets(application_name)

        # delete pre hourly processor offsets
        if cfg.CONF.stage_processors.pre_hourly_processor_enabled:
            PreHourlyProcessor.reset_kafka_offsets()
Example No. 16
def sparkTask():
    from textblob import TextBlob
    import re    
    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext
    sc = SparkContext()
    ssc = StreamingContext(sc, 1)
    quotes = ssc.socketTextStream("localhost", 9999)
    dataSentencesPolarity = quotes.map(lambda x: TextBlob(re.sub('[^A-Za-z0-9 \.\']+', '',x))).map(lambda y: (str(y.upper())[:60], y.sentiment.polarity))
    dataSentencesPolarity.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination(20)  # Wait for the computation to terminate    
Example No. 17
def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store', dest='enriched_data_path', help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path', help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store', dest='kafka_zookeeper_hosts', help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store', dest='kafka_broker_list', help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store', dest='kafka_message_topic', help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store', dest='kafka_alert_topic', help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store', dest='kafka_enriched_data_topic', help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float', default=15.0,
        action='store', dest='streaming_batch_duration_sec', help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
        action='store', dest='max_batches', help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: 
        process_messages(time, rdd,
            ssc=ssc,
            model=model,
            enriched_data_path=options.enriched_data_path,
            zookeeper_hosts=options.kafka_zookeeper_hosts,
            kafka_alert_topic=options.kafka_alert_topic,
            kafka_enriched_data_topic=options.kafka_enriched_data_topic,
            max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
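json_to_row is defined elsewhere; a plausible minimal version (an assumption, not the original) turns each JSON string into a Row:

import json
from pyspark.sql import Row

def json_to_row(message):
    # hypothetical: one Row field per top-level JSON key
    return Row(**json.loads(message))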
Example No. 18
def main():
    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    zkQuorum = "localhost:2181"
    topic = "twitter_raw"
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: pickle.loads(x[1].decode("utf-8"))["text"])  # fetch the text
    count = lines.map(lambda line: len(line.split())).reduce(add)  # split into words and count
    count.foreachRDD(publishToRedis)  # publish to redis
    count.pprint()

    ssc.start()
    ssc.awaitTermination()
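publishToRedis is defined elsewhere; a hypothetical version using redis-py (host and channel name are assumptions) could be:

import redis

redis_client = redis.StrictRedis(host='localhost', port=6379)

def publishToRedis(rdd):
    # hypothetical: publish each per-batch word count to a Redis channel
    for count in rdd.collect():
        redis_client.publish('twitter_word_count', str(count))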
Example No. 19
    def start(self):

        sc = SparkContext(appName="PythonStreamingNOTHS")
        ssc = StreamingContext(sc, 10)

        kvs = KafkaUtils.createStream(ssc, self.zkQuorum, "spark-streaming-consumer", {self.topic: 1})
        kvs.pprint()  # print a sample of the events received in each window

        if self.topic == 'NOTHS-crawler-topic':
            kvs.foreachRDD(self.save_crawler_hbase)
        elif self.topic == 'NOTHS-trends-topic':
            kvs.foreachRDD(self.save_trends_hbase)

        ssc.start()
        ssc.awaitTermination()
Example No. 20
class xStreamProcessor:
    ip = socket.gethostbyname(socket.gethostname())
    port = 9999
    dstream = None
    sc = None
    ssc = None

    #def __init__(self,ip=None,port=None,spark_master = 'spark://localhost:7077'):
    def __init__(self,ip=None,port=None,spark_master = 'mesos://10.0.2.85:5050'):
        if ip is not None:
            self.ip = ip
        if port is not None:
            self.port = port
        self.sc = SparkContext(master=spark_master,appName='StreamProcessor')
        self.ssc = StreamingContext(self.sc, 1)
        #self.ssc.checkpoint(directory=None)
        hiveContext = HiveContext(self.sc)
        hiveContext.sql('DROP TABLE IF EXISTS default.tweet_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.tweet_stream (ip STRING, port STRING, date_time STRING, user STRING, msg STRING)')

        hiveContext.sql('DROP TABLE IF EXISTS default.email_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_stream (ip STRING, port STRING, date_time STRING, \
        fr STRING,to STRING, subject STRING, content STRING, subject_sentiment INT, content_sentiment INT, \
        subject_power INT, content_power INT,  subject_topic INT, content_topic INT, fraud_score DOUBLE)')

        hiveContext.sql('DROP TABLE IF EXISTS default.email_graph')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_graph (fr STRING,to STRING, dt STRING)')

        hiveContext.sql('DROP TABLE IF EXISTS default.trans_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.trans_stream (ip STRING,port STRING, date_time STRING, user STRING, amount DOUBLE, \
        big_trans INT, is_in_odd_day INT, is_at_odd_time INT)')

        self.dstream = self.ssc.socketTextStream(self.ip, self.port)


        self.process_stream()

        self.ssc.start()
        self.ssc.awaitTermination()

    def process_stream(self):
        parts = self.dstream.flatMap(lambda line: line.split("|"))
        words = parts.map(lambda p: p[3])
        pairs = words.map(lambda word: (word, 1))
        wordCounts = pairs.reduceByKey(lambda x, y: x + y)

        # Print the first ten elements of each RDD generated in this DStream to the console
        wordCounts.pprint()
Example No. 21
def bluecoat_parse(zk,topic,db,db_table,num_of_workers,batch_size):
    
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc,int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks}, keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]) \
        .flatMap(lambda row: row.split("\n")) \
        .filter(lambda row: rex_date.match(row)) \
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " ")) \
        .map(lambda row: split_log_entry(row)) \
        .map(lambda row: proxy_parser(row))
    saved_data = proxy_data.foreachRDD(lambda row: save_data(row,sqc,db,db_table,topic))
    ssc.start()
    ssc.awaitTermination()
Example No. 22
def main():
    brokers = 'localhost:9092'
    topic = 'openbmp.parsed.unicast_prefix'
    sc = SparkContext(appName='BGPPrefixOriginValidation')
    ssc = StreamingContext(sc,2)
 
    directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {'metadata.broker.list':brokers})
    #directKafkaStream.pprint()

    lines = directKafkaStream.flatMap(lambda x: x[1].splitlines()).filter(lambda line: line.startswith('add'))
    structured_rdd = lines.map(structure_data)
 
    structured_rdd.foreachRDD(lambda rdd: rdd.foreachPartition(validate_bgp_prefix)) 
    
    ssc.start()
    ssc.awaitTermination()
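structure_data and validate_bgp_prefix are not shown. A hypothetical sketch (the openbmp field positions are assumptions) illustrating the foreachPartition pattern, i.e. doing per-partition setup once rather than per record:

def structure_data(line):
    # hypothetical: treat each 'add' record as delimited text; these field
    # positions are assumptions, not the real openbmp layout
    fields = line.split('\t')
    return {'action': fields[0], 'prefix': fields[1], 'origin_as': fields[2]}

def validate_bgp_prefix(partition):
    # hypothetical: set up any validation session once per partition, then
    # check every prefix/origin pair in it
    for record in partition:
        print('validating %s originated by AS%s' % (record['prefix'], record['origin_as']))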
Example No. 23
def start_spark_streaming():
    question_db = QuestionDatabase(QB_QUESTION_DB)
    features = {name: instantiate_feature(name, question_db) for name in FEATURE_NAMES}

    sc = create_sc()
    b_features = sc.broadcast(features)
    ssc = StreamingContext(sc, 5)

    ssc.socketTextStream('localhost', 9999) \
        .repartition(QB_STREAMING_CORES - 1) \
        .flatMap(lambda line: generate_guesses(line, b_features)) \
        .map(lambda sg: evaluate_features(sg, b_features)) \
        .foreachRDD(score_and_save)

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
Example No. 24
def main():

    master = 'local[2]'
    app_name = 'reduce_demo1'

    # print(range(0, 3))

    sc = SparkContext(master, app_name)
    ssc = StreamingContext(sc, 15)

    host = 'localhost'
    port = 9999
    stream = ssc.socketTextStream(host, port)
    stream.foreachRDD(fun_union_in_dstream_foreachRDD)

    ssc.start()
    ssc.awaitTermination()
Example No. 25
def start():
    # local test mode
    # sc=SparkContext('local[2]',appName='NetworkWordCount')
    sc=SparkContext(appName='NetworkWordCount')
    ssc=StreamingContext(sc,1)
    # Create a DStream that will connect to hostname:port, like localhost:9999
    lines = ssc.socketTextStream("10.5.24.137", 9999)
    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))
    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordCounts)
    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example No. 26
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket ip address to attach for streaming '
                        'text data (default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    parser.add_argument('--model',
                        help='the serialized model to use',
                        default='model.json')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest
    model = args.model

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)
    somv = fromJSON(model)
    som = sc.broadcast(somv)

    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url,
                                                 rest_url, som))

    ssc.start()
    ssc.awaitTermination()
Example No. 27
def consumer():
    def process(time, rdd):
        global words
        words += Counter(dict(rdd.collect()))

    sc = SparkContext(appName='graaftel')
    ssc = StreamingContext(sc, 5)

    lines = ssc.socketTextStream(os.getenv('PRODUCER_SERVICE_HOST', 'localhost'),
                                 int(os.getenv('PRODUCER_SERVICE_PORT', 8080)))
    counts = lines.flatMap(lambda line: line.encode('ascii', 'ignore').lower().split()) \
                  .map(lambda word: word.translate(None, string.punctuation)) \
                  .filter(lambda word: word not in stop_words) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(add)
    counts.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
Example No. 28
def main():
#main function to execute code
    sc = SparkContext(appName="ReadingWriter")
    ssc = StreamingContext(sc,10)
    sqlContext = SQLContext(sc)
    zk_host = zk_ip+":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions={"amtest":1}
    #create kafka stream
    kvs = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions,valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    #readings = lines.map(lambda x: {"device_id":x["device_id"],"metric_time":x["metric_time"],"metric_name":x["metric_name"],"metric_value":x["metric_value"]})
    readings = lines.map(lambda x: {"device_id":x["device_id"],"metric_time":datetime.datetime.fromtimestamp(int(x["metric_time"])),"metric_name":x["metric_name"],"metric_value":float(x["metric_value"])})
    readings.foreachRDD(lambda rdd: rdd.saveToCassandra("metrics", "raw_metrics"))
    #readingdf.show()
    #readings.pprint()
    #lines.saveToCassandra("metrics", "raw_metrics")
    ssc.start()
    ssc.awaitTermination()
Example No. 29
def main():
#main function to execute code
    sc = SparkContext(appName="CouponCounterPySpark")
    ssc = StreamingContext(sc,10)
    zk_host = "localhost:2181"
    consumer_group = "coupon-event-consumers"
    kafka_partitions={"test":1}
    #create kafka stream
    lines = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions)
    events = lines.map(lambda line: line[1].split(','))
    tmpagg = events.map(lambda event: ((event[1]),1) )
    coupon_counts = tmpagg.reduceByKey(lambda x,y: x+y)
    coupon_records = coupon_counts.map(lambda x: {"offer_id" : x[0], "bucket" : str(datetime.datetime.now().strftime("%s")), "count" : int(x[1])})
    #coupon_records.pprint()
    #coupon_records.registerTempTable("coupon_counters")
    #coupon_records.select("offer_id","bucket","count").show()
    #coupon_records = coupon_counts.map(lambda record: {"offer_id" : record[0],"bucket" : str(int(datetime.datetime.now().strftime("%s"))*1000),"count" : int(record[1])}
    coupon_records.pprint()
    coupon_records.foreachRDD(lambda rdd: rdd.saveToCassandra("loyalty","coupon_counters"))
    ssc.start()
    ssc.awaitTermination()
Example No. 30
class Consumer:
	'Simple spark kafka streaming consumer'

	def __init__(self, casshost, interval, zookeeper, topic):
		self.conf = SparkConf().setAppName("KafkaSpark").set("spark.cassandra.connection.host", casshost)
		self.sc   = SparkContext(conf=self.conf)
		self.sqlContext = SQLContext(sparkContext=self.sc)
		self.ssc = StreamingContext(self.sc, batchDuration=interval)
		self.zookeeper = zookeeper
		self.topic = topic

	def check_and_write(self, x):
		try:
			x.toDF().write.format("org.apache.spark.sql.cassandra").options(table="test1", keyspace = "mykeyspace").save(mode ="append") 
		except ValueError:
			print("No rdd found!")

	def consume(self):
		messages = KafkaUtils.createStream(self.ssc, self.zookeeper, "spark-streaming-consumer", {self.topic: 1})
		lines = messages.map(lambda x: x[1])

		rows = lines.map(lambda x: { 
			"data": json.loads(x)['data'],
			"time": json.loads(x)['time']
		})

		rows.foreachRDD(lambda x: {
			self.check_and_write(x)
		})

		self.ssc.start()
		self.ssc.awaitTermination()

	def stop(self):
		if self.sqlContext != None:
			self.sqlContext.stop()
		if self.ssc != None:
			self.ssc.stop()
		if self.sc != None:
			self.sc.stop()
Example No. 31
class StreamingDriver(object):
    def __init__(self, conf):
        # initialize config params
        self.batch_interval = conf['batch_interval']
        self.window_length = conf['window_length']
        self.sliding_interval = conf['sliding_interval']
        self.sm_socket = tuple(conf['sm_socket'])
        self.sm_listener = Listener(self.sm_socket)
        self.op_handler_socket = conf['op_handler_socket']

        self.spark_stream_address = conf['spark_stream_address']
        self.spark_stream_port = conf['spark_stream_port']

        self.start_time = time.time()

        self.sc = SparkContext(appName="Sonata-Streaming")
        self.sc.setLogLevel("OFF")
        self.ssc = StreamingContext(self.sc, self.batch_interval)

    def start(self):
        lines = self.ssc.socketTextStream(self.spark_stream_address,
                                          self.spark_stream_port)
        pktstream = (lines.map(lambda line: processLogLine(line)))
        print(self.window_length, self.sliding_interval)
        self.process_pktstream(pktstream)
        self.ssc.start()
        self.ssc.awaitTermination()

    def process_pktstream(self, pktstream):
        print("pktstream")

        spark_queries = {}

        conn = self.sm_listener.accept()
        raw_data = conn.recv()
        data = pickle.loads(raw_data)

        queries = data['queries']
        join_queries = data['join_queries']

        for queryId in queries:
            query = queries[queryId]

            if not query.has_join and queryId not in join_queries:
                query_str = "pktstream.window(self.window_length, self.sliding_interval).transform(lambda rdd: (rdd.filter(lambda p : (p[1]==str('" + str(
                    queryId
                ) + "'))).map(lambda p : (p[2:]))." + query.compile(
                ) + ")).foreachRDD(lambda rdd:send_reduction_keys(rdd, " + str(
                    self.op_handler_socket) + "," + str(
                        self.start_time) + ",\'" + str(queryId) + "\'))"
                print(query_str)
                spark_queries[queryId] = eval(query_str)
            elif not query.has_join and queryId in join_queries:
                query_str = "pktstream.window(self.window_length, self.sliding_interval).transform(lambda rdd: (rdd.filter(lambda p : (p[1]==str('" + str(
                    queryId
                ) + "'))).map(lambda p : (p[2:]))." + query.compile(
                ) + "))"  #.foreachRDD(lambda rdd:send_reduction_keys(rdd, " + str(self.op_handler_socket) + "," + str(self.start_time) + ",\'" + str(queryId) + "\'))"
                print(query_str)
                spark_queries[queryId] = eval(query_str)
            else:
                query_str = query.compile(
                ) + ".foreachRDD(lambda rdd: print(\"Join \" + str(rdd.take(5))))"
                print(query_str)
                spark_queries[queryId] = eval(query_str)
Example No. 32
class Spark_Tracker():
    """Stream WebCam Images to Kafka Endpoint."""
    def __init__(self,
                 interval=0.1,
                 topic_to_consume='test',
                 topic_for_produce='position',
                 kafka_endpoint='master:6667'):
        """Initialize our yolo model."""
        self.yolo = YOLO()

        # Create Kafka Producer for sending results
        self.topic_to_consume = topic_to_consume
        self.topic_for_produce = topic_for_produce
        self.kafka_endpoint = kafka_endpoint
        self.producer = KafkaProducer(
            bootstrap_servers=kafka_endpoint,
            value_serializer=lambda m: json.dumps(m).encode('utf8'))
        """Initialize Spark environment."""

        sc = SparkContext(appName='VideoTics')
        self.ssc = StreamingContext(sc, interval)  # , 3)

        # Make Spark logging less extensive
        log4jLogger = sc._jvm.org.apache.log4j
        log_level = log4jLogger.Level.ERROR
        log4jLogger.LogManager.getLogger('org').setLevel(log_level)
        log4jLogger.LogManager.getLogger('akka').setLevel(log_level)
        log4jLogger.LogManager.getLogger('kafka').setLevel(log_level)
        self.logger = log4jLogger.LogManager.getLogger(__name__)
        self.objects_detected_view_text = ""

        # Set deep_sort param
        self.max_cosine_distance = 0.3
        self.nn_budget = None
        self.model_filename = 'model_data/mars-small128.pb'
        self.nms_max_overlap = 1.0
        self.encoder = gdet.create_box_encoder(self.model_filename,
                                               batch_size=1)
        self.metric = nn_matching.NearestNeighborDistanceMetric(
            "cosine", self.max_cosine_distance, self.nn_budget)
        self.tracker = Tracker(self.metric)

    def start_processing(self):
        """Start consuming from Kafka endpoint and detect objects."""
        kvs = KafkaUtils.createDirectStream(
            self.ssc, [self.topic_to_consume],
            {'metadata.broker.list': self.kafka_endpoint})
        kvs.foreachRDD(self.process_frame)
        self.ssc.start()
        self.ssc.awaitTermination()

    def detect_person_track(self, event):
        """Use Yolo to detect person."""
        try:
            decoded = base64.b64decode(event['image'])
        except TypeError:
            return

        # TODO: Picking unique filenames or find a way to send it to kafka

        filename = 'codev1frame.jpg'  # find a way to pick unique filenames
        with open(filename, 'wb') as f:
            f.write(decoded)
        frame = cv2.imread(filename)
        image = Image.fromarray(frame[..., ::-1])  # bgr to rgb
        boxs = self.yolo.detect_image(image)
        #print("box_num", len(boxs))
        features = self.encoder(frame, boxs)

        # score to 1.0 here).
        detections = [
            Detection(bbox, 1.0, feature)
            for bbox, feature in zip(boxs, features)
        ]

        # Run non-maxima suppression.
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        indices = preprocessing.non_max_suppression(boxes,
                                                    self.nms_max_overlap,
                                                    scores)
        detections = [detections[i] for i in indices]
        """Use deep-sort to track person."""
        # Call the tracker
        self.tracker.predict()
        self.tracker.update(detections)

        for track in self.tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlbr()
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                          (int(bbox[2]), int(bbox[3])), (255, 255, 255), 2)
            cv2.putText(frame, str(track.track_id),
                        (int(bbox[0]), int(bbox[1])), 0, 5e-3 * 200,
                        (0, 255, 0), 2)
        for det in detections:
            bbox = det.to_tlbr()
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                          (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)
        # sent result to kafka
        if len(boxs) > 0:
            for i in range(0, len(boxs)):
                self.objects_detected_view_text = 'ID:' + str(
                    track.track_id) + '  x:' + str(boxs[i][0]) + '  y:' + str(
                        boxs[i][1]) + '  width:' + str(
                            boxs[i][2]) + '  height:' + str(boxs[i][3])
                result = {
                    'ID': str(self.tracker.tracks[i].track_id),
                    'timestamp': dt.datetime.now().isoformat(),
                    'location_x': str(boxs[i][0]),
                    'w': str(boxs[i][2]),
                    'image': self.convert_image_to_text(frame)
                }
                self.producer.send('position', result)
                self.producer.flush()
                self.logger.info('prediction: ' +
                                 self.objects_detected_view_text)
            return

    def convert_image_to_text(self, frame):
        img_str = cv2.imencode('.jpeg', frame)[1]
        img_as_text = base64.b64encode(img_str).decode('utf-8')
        return img_as_text

    def process_frame(self, timestamp, dataset):
        # Definition of the parameters
        to_process = {}
        data = dataset.collect()
        self.logger.info('\033[3' + str(randint(1, 7)) + ';1m' +  # Color
                         '-' * 25 + '[ NEW MESSAGES: ' + str(len(data)) +
                         ' ]' + '-' * 25 + '\033[0m'  # End color
                         )
        dt_now = dt.datetime.now()

        for datum in data:
            event = json.loads(datum[1])
            self.logger.info('Received Message: ' + event['camera_id'] +
                             ' - ' + event['timestamp'])
            dt_event = dt.datetime.strptime(event['timestamp'],
                                            '%Y-%m-%dT%H:%M:%S.%f')
            delta = dt_now - dt_event
            #print("timestamp = " + str(dt_event))
            if delta.seconds > 5:
                continue
            to_process[event['camera_id']] = event

        if len(to_process) == 0:
            self.logger.info('Skipping processing...')

        for key, event in to_process.items():
            self.logger.info('Processing Message: ' + event['camera_id'] +
                             ' - ' + event['timestamp'])
            start = timer()
            detection_result = self.detect_person_track(event)

            end = timer()
            delta = end - start
            self.logger.info('Done after ' + str(delta) + ' seconds.')

            try:
                if detection_result:
                    self.logger.info('Sent image to Kafka endpoint.')

            except AssertionError:
                self.objects_detected_view_text = 'No person found!'
                continue
            # Press Q to stop!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
Example No. 33
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName("TwitterStreamApp")
# create spark instance with the above configuration
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 5)  # 5 second batch interval

IP = "localhost"  # Replace with your stream IP
Port = 9009  # Replace with your stream port

lines = ssc.socketTextStream(IP, Port)
lines.pprint()  # Print tweets we find to the console
ssc.start()  # Start reading the stream
ssc.awaitTermination()  # Wait for the process to terminate

# TweetRead.py
# This first python script doesn’t use Spark at all:
import os
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import socket
import json

from apiConfigs import twitterConfigs
consumer_key = twitterConfigs.apiKey
consumer_secret = twitterConfigs.secretKey
access_token = twitterConfigs.token
Example No. 34
def hastags_func(line):
    n = line.split(";")[7]
    if (',' in n):
        return n.split(",")
    return [n]


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, int(sys.argv[2]))
ssc.checkpoint("~/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
"""
hashtags_count=dataStream.flatMap(hastags)\
                .map(lambda hashtag : (hashtag, 1))\
                .reduceByKeyAndWindow(lambda x,y:int(x)+int(y),int(sys.argv[1]),1)
"""
hash_count=dataStream.window(int(sys.argv[1]),1)\
                .flatMap(hastags_func)\
                .map(lambda hashtag : (hashtag, 1))\
                .reduceByKey(lambda x,y:int(x)+int(y))

hash_count.foreachRDD(sorted_print)

ssc.start()
ssc.awaitTermination(25)
ssc.stop()
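sorted_print is not shown; a hypothetical version that prints the five most frequent hashtags of each window might be:

def sorted_print(rdd):
    # hypothetical: sort by descending count and print the top five tags
    top = rdd.sortBy(lambda pair: -pair[1]).take(5)
    print(','.join(tag for tag, _ in top))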
Example No. 35
        for record in taken[:num]:
            with open("my_chem.csv", "a") as myfile:
                writer = csv.writer(myfile)
                writer.writerow(list(record))

    val.foreachRDD(takeAndPrint)


player_r = player_r.reduceByKey(lambda x, y: (x + y) / 2)
player_rr = player_r.map(lambda x: (x[0][0], x[1]))
player_rr = player_rr.map(lambda x: (1, x))
date = date.map(lambda x: (1, x))
player_rr = player_rr.join(date)
player_rr = player_rr.map(lambda x: (x[1][0][0], x[1][0][1], x[1][1]))
#player_rr.pprint()
tpprint_rating(player_rr)
#tpprint_chemistry_same(chemistry_same)
#tpprint_chemistry_opp(chemistry_opp)
player_profile1 = player_profile.map(lambda x: (x[0][0], (x[0][0], x[1][0], x[
    1][1], x[1][2], x[1][3], x[1][4])))
tpprint_player_profile(player_profile1)

all_chemistry = chemistry_same.union(chemistry_opp)
all_chemistry = all_chemistry.map(lambda x: (x[0][0], x[0][1], x[1]))
tpprint_chem(all_chemistry)
#all_chemistry.pprint()
ssc.start()
ssc.awaitTermination(250)  # how many seconds to wait; call awaitTermination() with no argument to wait indefinitely
ssc.stop()
Example No. 36
                                 optimizing_fn=max)


if __name__ == '__main__':
    if len(sys.argv) < 4:
        raise Exception(
            "Insufficient Arguments: python3 main.py <port> <stream_interval> <path_to_test_data>"
        )

    APPNAME = "Poker Hand Classification"
    HOSTNAME = "localhost"
    MASTER = "local[4]"
    PORT = int(sys.argv[1])
    STREAM_INTERVAL = int(sys.argv[2])
    TEST_DATA_FILE_PATH = sys.argv[3]
    MINIMUM_RDD_LIST_LEN = 3
    K = 2
    RDD_LIST = list()
    ec_spark_context = SparkContext(MASTER, APPNAME)
    ec_spark_streaming_context = StreamingContext(
        ec_spark_context, batchDuration=STREAM_INTERVAL)

    # Read the test data set.
    poker_hands_test_rdd = ec_spark_context.textFile(TEST_DATA_FILE_PATH).map(
        tokenize).filter(not_contains_null).map(make_labeled_point)

    submit_spark_app(HOSTNAME, PORT)

    ec_spark_streaming_context.start()
    ec_spark_streaming_context.awaitTermination()
Example No. 37
#)
"""
data = kafkaStream.map(lambda line: json.loads(line)
"""
rsvp = kafkaStream.map(lambda line: line[1])
rsvp2 = rsvp.map(lambda line: json.loads(line.encode("ascii", "ignore")))

#kafkaStream.pprint()
"""
process = data.mapValues(lambda line: line.encode('ascii')).cache()
"""

#event = data["topic_name"]
#print(data.pprint())

rsvp.pprint()
"""
print (data.mapValues(enc).pprint())
"""

#print (event.pprint())
ssc.start()
time.sleep(100)
ssc.stop(stopSparkContext=True, stopGraceFully=True)
"""
ssc.awaitTermination().stop(stopSparkContext=True, stopGraceFully=True)
"""

#bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8:2.1.0
#bin/spark-submit --jars <spark-streaming-kafka-0-8-assembly.jar>
Example No. 38
    return line.split(";")[7].split(",")


def f1(r):
    ab1 = r.sortBy(lambda x: (-x[1], x[0]))
    ab2 = ab1.collect()
    c = 0
    i = 0
    if (ab2 != []):
        while (c != 5):
            if (ab2[i][0] != ''):
                if (c != 4):
                    print(ab2[i][0], end=',')
                else:
                    print(ab2[i][0])
                c += 1
            i += 1


context = SparkContext(conf=conf)
s1 = StreamingContext(context, int(sys.argv[2]))
s1.checkpoint("~/checkpoint_BIGDATA")
stream = s1.socketTextStream("localhost", 9009)
x1 = stream.window(int(sys.argv[1]), 1)
x2 = x1.flatMap(rc).map(lambda nam: (nam, 1))
output = x2.reduceByKey(lambda a, b: int(a) + int(b))
output.foreachRDD(f1)
s1.start()
s1.awaitTermination(25)
s1.stop()
Example No. 39
class Spark_Object_Detector():
    """Stream WebCam Images to Kafka Endpoint."""
    def __init__(self,
                 interval=10,
                 topic_to_consume='test',
                 topic_for_produce='resultstream',
                 kafka_endpoint='127.0.0.1:9092'):
        """ Initialize our yolo and firearm model"""

        self.detector = SuspicionDetection.SuspicionDetection()
        self.detector.enable_yolo_detection()
        self.detector.enable_firearm_detection()
        """Initialize Spark & TensorFlow environment."""

        self.topic_to_consume = topic_to_consume
        self.topic_for_produce = topic_for_produce
        self.kafka_endpoint = kafka_endpoint

        # Create Kafka Producer for sending results
        self.producer = KafkaProducer(bootstrap_servers=kafka_endpoint)

        sc = SparkContext(appName='FirmArmDetection')
        self.ssc = StreamingContext(sc, interval)  # , 3)

        # Make Spark logging less extensive
        log4jLogger = sc._jvm.org.apache.log4j
        log_level = log4jLogger.Level.ERROR
        log4jLogger.LogManager.getLogger('org').setLevel(log_level)
        log4jLogger.LogManager.getLogger('akka').setLevel(log_level)
        log4jLogger.LogManager.getLogger('kafka').setLevel(log_level)
        self.logger = log4jLogger.LogManager.getLogger(__name__)
        self.objects_detector_prediction = []
        self.objects_detected_view_text = ""

    def _update_predictions(self):
        self.objects_detector_prediction = self.detector.get_yolo_prediction()
        self.firearm_detector_prediction = (
            self.detector.get_firearm_detector_prediction())
        self.activity_detector_prediction = (
            self.detector.get_activity_detector_prediction())
        self.event_detector_prediction = (
            self.detector.get_event_detector_prediction())

        self.detected_objects = []
        if self.objects_detector_prediction:
            self.detected_objects.extend(self.objects_detector_prediction)
        if self.firearm_detector_prediction:
            self.detected_objects.extend(self.firearm_detector_prediction)

        if self.detected_objects:
            self._update_detected_objects(self.detected_objects)

    def _update_detected_objects(self, objects_prediction):
        parsed_objects = [p['label'] for p in objects_prediction]
        parsed_objects_dict = collections.Counter(parsed_objects)
        detected_suspicious_objects = False
        objects = ''

        for (obj, count) in parsed_objects_dict.items():
            objects += '%s (%d)\n' % (obj, count)
            if obj in vgconf.SUSPICIOUS_OBJECTS_LIST:
                detected_suspicious_objects = True

        self.objects_detected_view_text = objects
        """ Do when suspicious object is detected """
        # Start alert if suspicious object is detected.
        # if detected_suspicious_objects:
        #     self._start_alert()

    def start_processing(self):
        """Start consuming from Kafka endpoint and detect objects."""
        kvs = KafkaUtils.createDirectStream(
            self.ssc, [self.topic_to_consume],
            {'metadata.broker.list': self.kafka_endpoint})
        kvs.foreachRDD(self.handler)
        self.ssc.start()
        self.ssc.awaitTermination()

    def detect_objects(self, event):
        """Use Yolo and Incepiton Model to detect objects."""

        decoded = base64.b64decode(event['image'])

        # TODO: Picking unique filenames or find a way to send it to kafka

        filename = 'C:\\Users\\hp\\Desktop\\codev1frame.jpg'  # I assume you have a way of picking unique filenames
        with open(filename, 'wb') as f:
            f.write(decoded)
        img = cv2.imread(filename)

        # Prepare object for sending to endpoint
        result = {
            'timestamp': event['timestamp'],
            'camera_id': event['camera_id'],
            'image': self.get_box_plot(img),
            'prediction': self.objects_detected_view_text
        }
        return json.dumps(result)

    def get_box_plot(self, img):
        self.detector.detect(img)
        frame = self.detector.plot_objects(img)
        self._update_predictions()
        img_str = cv2.imencode('.jpeg', frame)[1]
        img_as_text = base64.b64encode(img_str).decode('utf-8')
        return img_as_text

    def handler(self, timestamp, message):
        """Collect messages, detect object and send to kafka endpoint."""
        records = message.collect()
        # For performance reasons, we only want to process the newest message
        # for every camera_id
        to_process = {}
        self.logger.info('\033[3' + str(randint(1, 7)) + ';1m' +  # Color
                         '-' * 25 + '[ NEW MESSAGES: ' + str(len(records)) +
                         ' ]' + '-' * 25 + '\033[0m'  # End color
                         )
        dt_now = dt.datetime.now()
        for record in records:
            event = json.loads(record[1])
            self.logger.info('Received Message: ' + event['camera_id'] +
                             ' - ' + event['timestamp'])
            dt_event = dt.datetime.strptime(event['timestamp'],
                                            '%Y-%m-%dT%H:%M:%S.%f')
            delta = dt_now - dt_event
            print("timestamp = " + str(dt_event))
            if delta.seconds > 5:
                continue
            to_process[event['camera_id']] = event

        if len(to_process) == 0:
            self.logger.info('Skipping processing...')

        for key, event in to_process.items():
            self.logger.info('Processing Message: ' + event['camera_id'] +
                             ' - ' + event['timestamp'])
            start = timer()
            detection_result = self.detect_objects(event)
            self.logger.info('prediction: ' + self.objects_detected_view_text)
            end = timer()
            delta = end - start
            self.logger.info('Done after ' + str(delta) + ' seconds.')
            self.producer.send(self.topic_for_produce,
                               detection_result.encode('utf-8'))
            self.logger.info('Sent image to Kafka endpoint.')
            self.producer.flush()
Example No. 40
            print(','.join(array))
            return
        else:
            array.append(element[0])
            count += 1


window_size = int(sys.argv[1])
batch_size = int(sys.argv[2])

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
#sc.setLogLevel('ERROR')

ssc = StreamingContext(sc, 1)
ssc.checkpoint("./checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
hashtags = dataStream.map(lambda x: ret_tags(''.join(x.split(';')[7]))).window(
    window_size, batch_size).flatMap(lambda x: parse(x)).filter(
        lambda x: not (x == '')).map(lambda x: (x, 1))
tagcounts = hashtags.reduceByKey(lambda x, y: x + y)
sorted_tagcounts = tagcounts.transform(
    lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))

sorted_tagcounts.foreachRDD(lambda rdd: print_top_5(rdd.collect()))
ssc.start()
ssc.awaitTermination(100)
ssc.stop()
Example No. 41
    temp_rdd_2 = temp_rdd.collect()
    if (temp_rdd_2 != []):
        for i in range(5):
            if (i != 4):
                print(temp_rdd_2[i][0], end=",")
            else:
                print(temp_rdd_2[i][0])


window_size = int(sys.argv[1])
batch = int(sys.argv[2])

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, batch)
ssc.checkpoint("~/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

tweets = dataStream.window(window_size, 1)
flat_tweets = tweets.flatMap(line_split).map(lambda w: (w, 1))
reduced_tweets = flat_tweets.reduceByKey(lambda x, y: int(x) + int(y))

reduced_tweets.foreachRDD(printrdd2)

ssc.start()
ssc.awaitTermination(60)
ssc.stop()
Example No. 42
def main():
    # Parse the configuration
    app_id = int(sys.argv[1])
    master = sys.argv[2]
    app_name = sys.argv[3]

    # Application configuration
    assert APP_CONFIG.get(app_id) is not None, \
        '[myapp streaming_app_main.main()] configuration error invalid APP_CONFIG with app.id = ' + str(app_id)
    app_conf = map_conf_properties(APP_CONFIG.get(app_id), 'app.id')[app_id]
    spark_home = app_conf['sparkHome']
    pyFiles = app_conf['pyFiles.list']
    di_id = app_conf.get('app.interfaceId')

    # Data interface configuration
    di_in_conf_with_ds_conf = get_di_conf_with_ds_conf(
        di_id,
        DATAINTERFACE_CONFIG,
        DATASOURCE_CONFIG,
        di_key='interface.id',
        di_ds_key='interface.sourceId',
        ds_key='source.id',
        merge_key_name='interface.id')[di_id]
    print('= = ' * 20, type(di_in_conf_with_ds_conf),
          'di_in_conf_with_ds_conf = ')
    pprint(di_in_conf_with_ds_conf)

    schema_conf_string = di_in_conf_with_ds_conf['schema']
    struct_type = generate_df_schmea(schema_conf_string)
    # schema_field_list = [x.name for x in struct_type.fields]
    di_in_conf_with_ds_conf['struct.type'] = struct_type
    # di_in_conf_with_ds_conf['struct.field.list'] = schema_field_list

    di_out_confs = [
        kv for kv in DATAINTERFACE_CONFIG.iteritems()
        if kv[1].get('interface.type', '') == 'output'
    ]
    print('= = ' * 20, type(di_out_confs), 'di_out_confs = ')
    pprint(di_out_confs)

    di_out_confs_with_ds_conf = list_dict_merge([
        get_di_conf_with_ds_conf(kv[0],
                                 DATAINTERFACE_CONFIG,
                                 DATASOURCE_CONFIG,
                                 di_key='interface.id',
                                 di_ds_key='interface.sourceId',
                                 ds_key='source.id',
                                 merge_key_name='interface.id')
        for kv in DATAINTERFACE_CONFIG.iteritems()
        if kv[1].get('interface.type', '') == 'output'
    ])

    print('= = ' * 20, type(di_out_confs_with_ds_conf),
          'di_out_confs_with_ds_conf = ')
    pprint(di_out_confs_with_ds_conf)

    # External cache configuration
    cache_confs_with_ds_conf = list_dict_merge([
        get_di_conf_with_ds_conf(kv[0],
                                 CACHE_CONFIG,
                                 DATASOURCE_CONFIG,
                                 di_key='cache.id',
                                 di_ds_key='cache.sourceId',
                                 ds_key='source.id',
                                 merge_key_name='cache.id')
        for kv in CACHE_CONFIG.iteritems()
    ])
    print('= = ' * 20, type(cache_confs_with_ds_conf),
          'cache_confs_with_ds_conf = ')
    pprint(cache_confs_with_ds_conf)

    # Configuration of the prepare stage for the given input interface
    # Configuration of the enabled steps within the prepare stage
    # Note: filtering a dict passes the dict's keys to the function
    prepares_config_active = PREPARES_CONFIG[di_id] \
        if PREPARES_CONFIG.get(di_id, {}).get('prepares.enabled', False) else {}
    # print('= = ' * 20, type(prepares_config_active), 'prepares_config_active = ')
    # pprint(prepares_config_active)

    # TODO: the == test of the results of the two approaches was False; remove these comments
    # prepares_config_active_steps = filter(
    # lambda step_conf: step_conf[1].get('step.enabled', False),
    #     map(lambda step_conf: (step_conf[0], map_conf_properties(step_conf[1])),
    #         prepares_config_active.get('steps', {}).iteritems()
    #     )
    # )
    prepares_config_active_steps = \
        [(k, map_conf_properties(v)) for k, v in prepares_config_active.get('steps', {}).iteritems()
         if v.get('step.enabled', False)]

    print('= = ' * 20, type(prepares_config_active_steps),
          'prepares_config_active_steps = ')
    pprint(prepares_config_active_steps)

    # Configuration of the compute stage for the given input interface
    # After the filter this becomes a list; each element is a tuple (computeStatistics.id, computeStatistics.conf_dict)
    computes_config_active = COMPUTES_CONFIG[di_id] \
        if COMPUTES_CONFIG.get(di_id, {}).get('computeStatistics.enabled', False) else {}

    # list[{computeStatistic.id: {conf}}, ...]
    # # TODO: the == test of the results of the two approaches was False; remove these comments
    # compute_computeStatistics_config_active = filter(
    #     lambda computeStatistic_conf: computeStatistic_conf[1].get('computeStatistic.enabled', False),
    #     computes_config_active.get('computeStatistics', {}).iteritems())

    compute_computeStatistics_config_active = [
        kv for kv in computes_config_active.get('computeStatistics',
                                                {}).iteritems()
        if kv[1].get('computeStatistic.enabled', False)
    ]
    print('= = ' * 20, type(compute_computeStatistics_config_active),
          'compute_computeStatistics_config_active = ')
    pprint(compute_computeStatistics_config_active)

    # {computeStatistic.id -> list[step_conf_tuple]}, where step_conf_tuple = (step_id, step_conf_dict)
    compute_prepares_config_active = dict(
        map(
            lambda computeStatistic_conf:
            (computeStatistic_conf[0],
             sorted(
                 list_dict_merge(
                     map(
                         lambda step_conf: map_conf_properties(
                             step_conf[1], 'step.id'),
                         filter(
                             lambda step_conf: step_conf[1].get(
                                 'step.enabled', False), computeStatistic_conf[
                                     1].get('prepares.steps', {}).iteritems()))
                 ).iteritems())), compute_computeStatistics_config_active))
    # print('= = ' * 30, compute_prepares_config_active2 == compute_prepares_config_active)

    print('= = ' * 20, type(compute_prepares_config_active),
          'compute_prepares_config_active = ')
    pprint(compute_prepares_config_active)
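    # A sketch of an equivalent dict comprehension for the nested map/filter chain above,
    # kept commented out for readability only; it assumes list_dict_merge accepts the list
    # passed to it, as in the list-comprehension calls earlier in this function:
    # compute_prepares_config_active = {
    #     cs_id: sorted(
    #         list_dict_merge([
    #             map_conf_properties(step_conf, 'step.id')
    #             for step_id, step_conf in cs_conf.get('prepares.steps', {}).iteritems()
    #             if step_conf.get('step.enabled', False)
    #         ]).iteritems())
    #     for cs_id, cs_conf in compute_computeStatistics_config_active
    # }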

    compute_computes_config_active = dict(
        map(
            lambda computeStatistic_conf:
            (computeStatistic_conf[0],
             sorted(
                 list_dict_merge(
                     map(
                         lambda step_conf: map_conf_properties(
                             step_conf[1], 'step.id'),
                         filter(
                             lambda step_conf: step_conf[1].get(
                                 'step.enabled', False), computeStatistic_conf[
                                     1].get('computes.steps', {}).iteritems()))
                 ).iteritems())), compute_computeStatistics_config_active))
    print('= = ' * 20, type(compute_computes_config_active),
          'compute_computes_config_active = ')
    pprint(compute_computes_config_active)

    test_flag = False
    if not test_flag:
        # Initialization
        # serializer tests
        # serializer defaults to PickleSerializer()  # UnpicklingError: invalid load key, '{'.
        # serializer=MarshalSerializer()  # ValueError: bad marshal data
        # serializer=AutoSerializer()  # ValueError: invalid sevialization type: {
        # serializer=CompressedSerializer(PickleSerializer())  # error: Error -3 while decompressing data: incorrect header check

        # sc = SparkContext(master, app_name, sparkHome = spark_home, pyFiles=pyFiles)
        # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=MarshalSerializer())
        # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=AutoSerializer())
        # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=CompressedSerializer(PickleSerializer()))

        spark_conf = SparkConf()
        spark_conf.setMaster(master).setAppName(app_name).setSparkHome(
            spark_home)

        # Spark Streaming tuning configuration
        spark_streaming_blockInterval = str(
            app_conf.get('spark.streaming.blockInterval', '')).strip()
        if spark_streaming_blockInterval:
            spark_conf.set('spark.streaming.blockInterval',
                           spark_streaming_blockInterval)

        spark_streaming_kafka_maxRatePerPartition = str(
            app_conf.get('spark.streaming.kafka.maxRatePerPartition',
                         '')).strip()
        if spark_streaming_kafka_maxRatePerPartition:
            spark_conf.set('spark.streaming.kafka.maxRatePerPartition',
                           spark_streaming_kafka_maxRatePerPartition)

        spark_streaming_receiver_maxRate = str(
            app_conf.get('spark.streaming.receiver.maxRate', '')).strip()
        if spark_streaming_receiver_maxRate:
            spark_conf.set('spark.streaming.receiver.maxRate',
                           spark_streaming_receiver_maxRate)

        spark_streaming_concurrentJobs = str(
            app_conf.get('spark.streaming.concurrentJobs', '')).strip()
        if spark_streaming_concurrentJobs:
            spark_conf.set('spark.streaming.concurrentJobs',
                           spark_streaming_concurrentJobs)

        # Spark SQL tuning configuration
        spark_sql_shuffle_partitions = str(
            app_conf.get('spark.sql.shuffle.partitions', '')).strip()
        if spark_sql_shuffle_partitions:
            spark_conf.set('spark.sql.shuffle.partitions',
                           spark_sql_shuffle_partitions)

        sc = SparkContext(conf=spark_conf)
        for path in (pyFiles or []):
            sc.addPyFile(path)

        # External cache optimization: distribute the caches via broadcast
        cache_manager = CacheManager()
        cache_broadcast_list = \
            [(cache_id, cache_manager.cache_dataset(sc, cache_conf))
             for cache_id, cache_conf in cache_confs_with_ds_conf.iteritems()
             if cache_conf.get('broadcast.enabled', False)]

        for cache_id, cache_broadcast in cache_broadcast_list:
            cache_confs_with_ds_conf[cache_id]['broadcast'] = cache_broadcast
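        # Downstream worker code would typically read a broadcast cache through its
        # .value attribute. Illustrative sketch only: it assumes cache_dataset returns
        # a pyspark Broadcast and that a lookup() helper exists (both are hypothetical here).
        # cached = cache_confs_with_ds_conf[cache_id]['broadcast'].value
        # enriched_rdd = rdd.map(lambda row: lookup(cached, row))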

        batchDurationSeconds = app_conf['batchDuration.seconds']
        ssc = StreamingContext(sc, batchDurationSeconds)
        sqlc = SQLContext(sc)

        # Read the data source
        stream = StreamingReader.readSource(ssc, di_in_conf_with_ds_conf,
                                            app_conf)
        # Stream processing: 1) initialize an instance of the handler class for the specified data interface from the config, 2) call that instance's stream-data processing method
        # kafka_wordcount test
        # counts = stream.flatMap(lambda line: line.split(" ")) \
        # .map(lambda word: (word, 1)) \
        # .reduceByKey(lambda a, b: a+b)
        # counts.pprint()
        StreamingApp.process(stream, sc, sqlc, di_in_conf_with_ds_conf,
                             di_out_confs_with_ds_conf,
                             cache_confs_with_ds_conf,
                             prepares_config_active_steps,
                             compute_prepares_config_active,
                             compute_computes_config_active)

        ssc.start()
        ssc.awaitTermination()
Ejemplo n.º 43
0
checkpoint_dir = './Checkpoint/spark'
ssc.checkpoint(checkpoint_dir)

kafka_params = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "myUserGroup",
    "enable.auto.commit": "false",
    "auto.offset.reset": "largest"
}
dstream = [KafkaUtils.createDirectStream(ssc, [tlist[i]], kafka_params,
                                         keyDecoder=spot_decoder,
                                         valueDecoder=spot_decoder,
                                         messageHandler=setHandler)
           for i in range(len(tlist))]
countList = []

for index in range(len(tlist)):
    print(tlist[index])
    tempt = (dstream[index].map(lambda x: getID(x))
                           .map(lambda x: (1, x))
                           .updateStateByKey(updatefunction))
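    # Each message id from getID is paired under the constant key 1, so updateStateByKey
    # keeps one running state per topic DStream, merged by updatefunction (defined earlier
    # in the original script, not shown in this excerpt).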
    print("lalalaall")
    countList.append(tempt)
    countList[index].foreachRDD(lambda x: displayID(x))

ssc.start()
ssc.awaitTermination(5000)
ssc.stop()
Ejemplo n.º 44
0
                             index_col=0).transpose()
        oldlog['cnt'] = oldlog['cnt'].astype(int)
        if user_id in oldlog.columns:
            oldlog[user_id] += self.chkedwords_df[user_id]
            oldlog[user_id] = oldlog[user_id].fillna(0).astype(int)
            oldlog.transpose().to_csv('./newslog.csv',
                                      encoding='euc_kr',
                                      mode='w')
        else:
            newlog = pd.concat([oldlog, self.chkedwords_df],
                               axis=1,
                               join_axes=[oldlog.index],
                               join='inner')
            newlog[user_id] = newlog[user_id].fillna(0).astype(int)
            newlog.transpose().to_csv('./newslog.csv', encoding='euc-kr')


if __name__ == '__main__':
    sc = SparkContext()
    ssc = StreamingContext(sc, 10)

    date = time.strftime("%y%m%d")
    tstream = ssc.textFileStream('hdfs://192.168.56.102:9000/cplogs/news/' +
                                 date)

    logprocess = NewsLogToCSV()
    tstream.foreachRDD(logprocess.process_newslog)

    ssc.start()
    ssc.awaitTermination()
Ejemplo n.º 45
0
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a SparkContext (using 2 local cores) and a StreamingContext with a 4-second batch interval
sc = SparkContext("spark://master:7077", "NetworkWordCount")
ssc = StreamingContext(sc, 4)

# Create a DStream that reads from a socket
lines = ssc.socketTextStream("master", 9999)

# Word count
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first 10 elements of each RDD
wordCounts.pprint()

ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait for termination
Ejemplo n.º 46
0
wordCounts.pprint(5)

#Count lines
totalLines = 0
linesCount = 0


def computeMetrics(rdd):
    global totalLines
    global linesCount
    linesCount = rdd.count()
    totalLines += linesCount
    print rdd.collect()
    print "Lines in RDD :", linesCount, " Total Lines:", totalLines


lines.foreachRDD(computeMetrics)


#Compute window metrics
def windowMetrics(rdd):
    print "Window RDD size:", rdd.count()


windowedRDD = lines.window(6, 3)
windowedRDD.foreachRDD(windowMetrics)

streamContext.start()
#streamContext.stop()
streamContext.awaitTermination()
print "Overall lines :", totalLines
Ejemplo n.º 47
0
class SparkConsumer:
    """
    Class for spark consumer reading from kafka topic which contains the ecg timeseries data.
    """

    def __init__(self, kafka_config_infile, ecg_spark_config_infile, postgres_config_infile, s3bucket_config_infile,
                 batch_interval):
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            filename='./tmp/spark_consumer.log',
                            filemode='w')
        self.logger = logging.getLogger('py4j')
        self.logger.setLevel(logging.WARN)
        self.ecg_spark_config = helpers.parse_config(ecg_spark_config_infile)
        self.postgres_config = helpers.parse_config(postgres_config_infile)
        self.s3bucket_config = helpers.parse_config(s3bucket_config_infile)
        self.kafka_config = helpers.parse_config(kafka_config_infile)
        self.sc = SparkContext(appName='ECGDashboardApp')
        self.sc.setLogLevel("FATAL")
        self.ssc = StreamingContext(self.sc, batch_interval)
        self.logger.warn('Opened spark Context')
        self.kafkastream = self.connectToKafkaBrokers()
        self.logger.warn('Opened connection to Kafka brokers')
        self.a = self.sc.accumulator(0)

    def start(self):
        """
        Starts the streaming context to start subscribing to kafka topic
        """
        self.ssc.start()
        self.logger.warn('Spark context started')
        self.ssc.awaitTermination()
        self.logger.warn('Spark context terminated')

    def connectToKafkaBrokers(self):
        """
        Setup subscription to kafka topic
        """
        kafkastream = KafkaUtils.createDirectStream(self.ssc, [self.kafka_config["topic"]],
                                                    {"metadata.broker.list": self.kafka_config['ip-addr'],
                                                     "group.id": self.ecg_spark_config['group-id'],
                                                     "num.partitions": str(self.kafka_config['partitions'])})
        self.logger.warn('Connected kafka stream to spark context')
        return kafkastream

    def runECG(self):
        """
        Grouping and insertion of ecg samples into database
        :return:
        """
        lines = self.kafkastream.map(lambda x: x[1])
        self.logger.warn('Reading in kafka stream line')

        raw_record = lines.map(lambda line: line.encode('utf-8')). \
            map(lambda line: line.split(','))
        if raw_record is not None:
            raw_record.pprint()
        else:
            print('raw_record is none')

        record_interval = raw_record.map(lambda x: (x[0], x[1:])). \
            groupByKey().map(lambda x: (x[0], list(x[1])))
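        # The first CSV field acts as the key; groupByKey gathers every sample sharing
        # that key within the current batch, and list() materializes the grouped iterable.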

        record_interval.foreachRDD(lambda x: insertECGSamples(self.logger, self.postgres_config, accum(self.a), x))

        self.ssc.start()
        self.logger.warn('Spark context started')
        self.ssc.awaitTermination()
        self.logger.warn('Spark context terminated')

    def runHR(self):
        """
        Grouping and calculation of HR for insertion in database
        :return:
        """
        s3 = boto3.client('s3')
        obj = s3.get_object(Bucket=self.s3bucket_config['bucket'],
                            Key="mgh001_metadata.txt")
        file_content = obj['Body'].read().decode('utf-8')
        meta_data = json.loads(file_content)
        fs = meta_data['fs']
        lines = self.kafkastream.map(lambda x: x[1])
        self.logger.warn('Reading in kafka stream line')
        raw_record = lines.map(lambda line: line.encode('utf-8')). \
            map(lambda line: line.split(','))
        if raw_record is not None:
            raw_record.pprint()
        else:
            print('raw_record is none')
        record_interval = raw_record.map(lambda line: (line[0], line[1:])). \
            groupByKey().map(lambda x: (x[0], list(x[1])))
        record_interval.foreachRDD(
            lambda x: processHRSample(self.logger, self.postgres_config, accum(self.a), fs, x))
        self.logger.warn('Saved records to DB')

        self.ssc.start()
        self.logger.warn('Spark context started')
        self.ssc.awaitTermination()
        self.logger.warn('Spark context terminated')
Ejemplo n.º 48
0
## Spark Streaming ###

import sys
## Develop Spark streaming context ##
# import os
# os.environ["SPARK_HOME"] = '/usr/lib/spark'
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
## creating spark stream for word count
if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName(
        "SparkStreamingcount").set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)
    strc = StreamingContext(sc, 1)

    strc.checkpoint(
        "hdfs://quickstart.cloudera:8020/user/cloudera/sparkstream")

    lines = strc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    count = lines.flatMap(lambda x: x.split(' ')).map(
        lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

    count.pprint()
    strc.start()
    strc.awaitTermination()
Ejemplo n.º 49
0
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext


if __name__ == "__main__":

	sc = SparkContext(master="local[2]", appName="StreamingErrorCount")
	ssc = StreamingContext(sc, 2)  # 2-second batch interval

	ssc.checkpoint("file:///home/felipe/checkpoint2")  # fault tolerance

	# lines is a sequence of RDDs
	lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))  # host and port

	counts = lines.countByWindow(10,2) # (window size, sliding interval) 
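	# With the 2-second batch interval above, countByWindow(10, 2) counts the lines received
	# over the last 10 seconds (5 batches) and emits an updated count every 2 seconds; both
	# the window length and the slide must be multiples of the batch interval.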

	counts.pprint()  # prints once per interval; no explicit loops are needed

	ssc.start()  # start listening for streaming data
	ssc.awaitTermination()  # wait for the streaming computation to finish
Ejemplo n.º 50
0
def shutdown_hook(producer):
	try:
		producer.flush(10)

	except KafkaError as kafka_error:
		logger.warn('Failed to flush pending messages to kafka, caused by:%s', kafka_error.message)

	finally:

		try:
			producer.close(10)

		except Exception as e:
			logger.warn('Failed to close kafka connection, caused by:%s', e.message)


def process_stream(stream, kafka_producer, target_topic):

	def send_to_kafka(rdd):
		results = rdd.collect()
		for r in results:
			data = json.dumps({
				'Symbol': r[0],
				'Timestamp': time.time(),
				'Average': r[1]
			})

			try:
				logger.info('Sending average price %s to kafka', data)
				kafka_producer.send(target_topic, value=data)

			except KafkaError as error:
				logger.warn('Failed to send average price to kafka, caused by:%s', error.message)

	def pair(data):
		record = json.loads(data.encode('utf-8'))
		return record.get('Symbol'), (float(record.get('LastTradePrice')), 1)

	# (symbol, (price, count))
	stream.map(pair).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).map(lambda (k, v): (k, v[0] / v[1])).foreachRDD(send_to_kafka)
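	# Worked example of the reduction above, with illustrative values: ('AAPL', (150.0, 1))
	# and ('AAPL', (152.0, 1)) reduce componentwise to ('AAPL', (302.0, 2)); the final map
	# then yields the average ('AAPL', 151.0), which send_to_kafka publishes to target_topic.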
	
if __name__ == '__main__':
	# Setup command line arguments.
	parser = argparse.ArgumentParser()
	parser.add_argument('source_topic', help='the kafka topic to subscribe from.')
	parser.add_argument('target_topic', help='the kafka topic to send message to.')
	parser.add_argument('kafka_broker', help='the kafka broker.')
	parser.add_argument('batch_duration', help='the batch duration in secs.')

	# Parse arguments.
	args = parser.parse_args()
	source_topic = args.source_topic
	target_topic = args.target_topic
	kafka_broker = args.kafka_broker
	batch_duration = int(args.batch_duration)

	# Create SparkContext and SparkStreamingContext
	sc = SparkContext('local[2]', 'AveragePrice')
	sc.setLogLevel('INFO')
	ssc = StreamingContext(sc, batch_duration)

	# Instantiate a Kafka stream for processing.
	directKafkaStream = KafkaUtils.createDirectStream(
		ssc, [source_topic], {'metadata.broker.list': kafka_broker})

	# Extract value
	stream = directKafkaStream.map(lambda x: x[1])

	# Instantiate a simple Kafka producer.
	kafka_producer = KafkaProducer(bootstrap_servers=kafka_broker)
	process_stream(stream, kafka_producer, target_topic)

	# Setup shutdown hook
	atexit.register(shutdown_hook, kafka_producer)
	ssc.start()
	ssc.awaitTermination()
Ejemplo n.º 51
0
ssc = StreamingContext(sc, batch_interval)

# A streaming application must run 24 hours a day, so it needs to be resilient to failures
# caused by unexpected errors such as system failures, driver failure, JVM crashes, etc.
# Checkpointing saves the generated RDDs to reliable storage and performs recovery after an error.
# To summarise, checkpoints provide a way of recovering to a safe, stable application snapshot.
# Using the ssc.checkpoint() method, we can tell the Spark engine where to store the checkpoint files.
ssc.checkpoint("checkpoint")

host = "localhost"
port = 9999

lines = ssc.socketTextStream(host, int(port))

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.updateStateByKey(updateFunc)

# Print the result
wordCounts.pprint()

ssc.start()
try:
    ssc.awaitTermination(timeout=60)
except KeyboardInterrupt:
    ssc.stop()
    sc.stop()

ssc.stop()
sc.stop()
Ejemplo n.º 52
0
def main():
    parser = OptionParser()
    parser.add_option('',
                      '--enriched_data_path',
                      action='store',
                      dest='enriched_data_path',
                      help='path to write enriched data')
    parser.add_option('',
                      '--model_path',
                      action='store',
                      dest='model_path',
                      help='path for model data')
    parser.add_option('',
                      '--kafka_zookeeper_hosts',
                      action='store',
                      dest='kafka_zookeeper_hosts',
                      help='list of Zookeeper hosts (host:port)')
    parser.add_option('',
                      '--kafka_broker_list',
                      action='store',
                      dest='kafka_broker_list',
                      help='list of Kafka brokers (host:port)')
    parser.add_option('',
                      '--kafka_message_topic',
                      action='store',
                      dest='kafka_message_topic',
                      help='topic to consume input messages from')
    parser.add_option('',
                      '--kafka_alert_topic',
                      action='store',
                      dest='kafka_alert_topic',
                      help='topic to produce alert messages to')
    parser.add_option('',
                      '--kafka_enriched_data_topic',
                      action='store',
                      dest='kafka_enriched_data_topic',
                      help='topic to produce enriched data to')
    parser.add_option('',
                      '--streaming_batch_duration_sec',
                      type='float',
                      default=15.0,
                      action='store',
                      dest='streaming_batch_duration_sec',
                      help='Streaming batch duration in seconds')
    parser.add_option('',
                      '--max_batches',
                      type='int',
                      default=0,
                      action='store',
                      dest='max_batches',
                      help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(
        ssc, [options.kafka_message_topic], {
            'metadata.broker.list': options.kafka_broker_list,
            'group.id': 'spark_streaming_processor.py'
        })

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: process_messages(
        time,
        rdd,
        ssc=ssc,
        model=model,
        enriched_data_path=options.enriched_data_path,
        zookeeper_hosts=options.kafka_zookeeper_hosts,
        kafka_alert_topic=options.kafka_alert_topic,
        kafka_enriched_data_topic=options.kafka_enriched_data_topic,
        max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
Ejemplo n.º 53
0
	return sum(new_values) + (total_sum or 0)
def flatt(url):
	parts=re.split(r',',url)
	for u in parts:
		yield (str(u),1)
if __name__=="__main__":
	if len(sys.argv)!=3:
		print("3 arguements required")
		sys.exit(-1)
	a=int(sys.argv[1])
	b=int(sys.argv[2])
	conf=SparkConf()
	conf.setAppName("BigData")
	sc=SparkContext(conf=conf)
	ssc=StreamingContext(sc,b)
	ssc.checkpoint("/some")
	dataStream=ssc.socketTextStream("localhost",9009)
	tweet2=dataStream.filter(lambda w:w.split(';')[7]!="")
	tweet=tweet2.map(lambda x:x.split(';')[7])
	job=tweet.flatMap(lambda x:flatt(x))
	windowedWordCounts = job.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, a, 1)
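	# reduceByKeyAndWindow with an inverse function (x - y) updates each window count
	# incrementally: counts from the batch sliding out of the window are subtracted rather
	# than recomputing the whole window, which is why checkpointing is enabled above.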
	def gunf(time,rdd):		
		val=sorted(rdd.collect(),key=lambda x:(-x[1],x[0]))
		i=0
		if(len(val)>4):
			print(val[0][0]+","+val[1][0]+","+val[2][0]+","+val[3][0]+","+val[4][0])
	windowedWordCounts.foreachRDD(gunf)
	ssc.start()
	ssc.awaitTermination(30)
	ssc.stop()
Ejemplo n.º 54
0
    if ',' not in t:
        return [t]
    else:
        y=t.split(",")
        return y
def fab(r):
    sr = r.sortBy(lambda x: (-x[1],x[0]))
    srr = sr.collect()
    c=0
    i=0
    if(srr!=[]):
        while(c!=5):
            if(srr[i][0]!=''):
                if(c!=4):
                    print(srr[i][0],end=',')
                else:
                    print(srr[i][0])
                c+=1
            i+=1
conf=SparkConf()
conf.setAppName("BigData")
ab=SparkContext(conf=conf)
cc=StreamingContext(ab,int(sys.argv[2]))
cc.checkpoint("~/checkpoint_BIGDATA")
stream=cc.socketTextStream("localhost",9009)
finalans=stream.window(int(sys.argv[1]),1).flatMap(rc).map(lambda x : (x, 1)).reduceByKey(lambda a,b:int(a)+int(b))
finalans.foreachRDD(fab)
cc.start()
cc.awaitTermination(25)
cc.stop()
Ejemplo n.º 55
0
    r = aw1.collect()
    if (r != []):
        f1(r)


def f1(inp):
    count = 0
    j = 0
    while (count != 5):
        if (inp[j][0] != ""):
            if (count != 4):
                print(inp[j][0], end=",")
            else:
                print(inp[j][0])
            count += 1
        j = j + 1


configuration = SparkConf()
configuration.setAppName("Assign3")
spark_context = SparkContext(conf=configuration)
stream_context = StreamingContext(spark_context, int(sys.argv[2]))
stream_context.checkpoint("~/checkpoint_Assign3")
stream = stream_context.socketTextStream("localhost", 8000)
o = stream.window(int(sys.argv[1]), 1).flatMap(cr).map(
    lambda z: (z, 1)).reduceByKey(lambda a, b: int(a) + int(b))
o.foreachRDD(f)
stream_context.start()
stream_context.awaitTermination(60)
stream_context.stop()
Ejemplo n.º 56
0
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.set("spark.master", "yarn")
conf.set("spark.app.name", "streamingapp")

sc = SparkContext(conf=conf)

streamc = StreamingContext(sc, batchDuration=15)

r1 = sc.textFile("s3://datasets-spark-learning/flat_files/au-500.csv")

ds1 = streamc.textFileStream(
    "s3://datasets-spark-learning/flat_files/csvfiles/")

ds2 = ds1.transform(lambda rdd: rdd.union(r1).map(lambda x: x + "spark"))
ds2.pprint()

streamc.start()
streamc.awaitTermination()
Ejemplo n.º 57
0
# Get relevant data
rows = rows.filter(lambda row: len(row) > 8)
airports_fromto = rows.map(lambda row: ( \
  (row[0], row[1], row[2], AMOrPM(row[5])), \
  (row[3], row[4], departureTimePretty(row[5]), float(row[8])) \
 ) \
)
# Filtering just necessary flights
airports_fromto = airports_fromto.filter(lambda row: row[0] == ('BOS', 'ATL', '2008-04-03', 'AM')) \
  .union(airports_fromto.filter(lambda row: row[0] == ('ATL', 'LAX', '2008-04-05', 'PM'))) \
  .union(airports_fromto.filter(lambda row: row[0] == ('PHX', 'JFK', '2008-09-07', 'AM'))) \
  .union(airports_fromto.filter(lambda row: row[0] == ('JFK', 'MSP', '2008-09-09', 'PM'))) \
  .union(airports_fromto.filter(lambda row: row[0] == ('DFW', 'STL', '2008-01-24', 'AM'))) \
  .union(airports_fromto.filter(lambda row: row[0] == ('STL', 'ORD', '2008-01-26', 'PM'))) \
  .union(airports_fromto.filter(lambda row: row[0] == ('LAX', 'MIA', '2008-05-16', 'AM'))) \
  .union(airports_fromto.filter(lambda row: row[0] == ('MIA', 'LAX', '2008-05-18', 'PM')))

# Minimum search
airports_fromto = airports_fromto.updateStateByKey(getMinimum)
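# getMinimum is defined earlier in the original script (not shown here); given the
# (route/date/AM-PM key, flight-details-with-price value) pairs built above, it presumably
# keeps the lowest-price flight seen so far for each key across batches.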

# Print and save
airports_fromto.foreachRDD(printResults)
airports_fromto.foreachRDD(saveResults)

# Kafka Sink
airports_fromto.foreachRDD(lambda rdd: rdd.foreachPartition(sendToKafka))

ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
Ejemplo n.º 58
0
import sys
Ejemplo n.º 59
0
    b = a.split(',')
    for i in b:
        if (i != ''):
            yield (i, 1)


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

batch = int(sys.argv[2])
window_size = int(sys.argv[1])

ssc = StreamingContext(sc, batch)
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

tweet = dataStream.map(tmp)

data = dataStream.window(
    window_size,
    1).flatMap(get_hashtag).reduceByKey(lambda x, y: x + y).transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))

data.foreachRDD(process_rdd)
#data.pprint(3)
ssc.start()
ssc.awaitTermination(12)
ssc.stop()
Ejemplo n.º 60
0
        words_df.registerTempTable("Words")
        # get the words from the table using SQL and print them
        words_df = sql_context.sql(
            "select word, word_count from Words order by word_count desc")
        words_df.show()
        # words_df.saveAsTextFiles("wc_output")
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)


# initializing spark context
sc = SparkContext("local[2]", "TCP Streaming word count")
# streaming context
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint_TwitterApp")
# getting the data from the stream
lines = ssc.socketTextStream("localhost", 9009)
# splitting the data using space as delimiter
words = lines.flatMap(lambda line: line.split(" "))
# mapping the words as key and value
pairs = words.map(lambda word: (word, 1))
# wordCounts = pairs.reduceByKey(lambda x, y: x + y)
# passing the words to the aggregate function which adds them to the previous count
words_total = pairs.updateStateByKey(aggregate_words_count)
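# aggregate_words_count is defined earlier in the original script (not shown in this excerpt).
# A typical updateStateByKey update function has the following shape; this is an illustrative
# sketch only, mirroring the pattern visible at the top of Ejemplo n.º 53 above:
#
# def aggregate_words_count(new_values, total_sum):
#     return sum(new_values) + (total_sum or 0)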
words_total.foreachRDD(rdd_processing)
# wordCounts.pprint()

ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait before termination