def main():
    # Connect to the standalone cluster master; stream with a 5-second batch interval
    sc = SparkContext("spark://ip-172-31-29-29:7077", "MyKafkaStream")
    ssc = StreamingContext(sc, 5)
    kafkaStream = KafkaUtils.createStream(ssc, "52.3.61.194:2181",
                                          "GroupNameDoesntMatter",
                                          {"parking_sensor_data": 2})
    # Average the two most recent readings per sensor key
    messages = kafkaStream.flatMap(lambda s: create_tuple(s[1])) \
                          .reduceByKey(lambda a, b: (int(a) + int(b)) / 2)
    messages1 = messages.filter(lambda s: s[1] > 0)
    messages1.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
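# A hedged sketch of the undefined create_tuple helper used above, assuming each
# Kafka message value is a comma-separated "sensor_id,value" record; both the
# record format and the parsing are assumptions, not part of the original snippet.
def create_tuple(message):
    # Hypothetical parser: split a "sensor_id,value" record into one
    # (key, value) pair; flatMap expects an iterable, so wrap it in a list.
    sensor_id, value = message.split(",", 1)
    return [(sensor_id, value)]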
def bro_parse(zk, topic, db, db_table, num_of_workers):
    app_name = "ONI-INGEST-{0}".format(topic)
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, 1)
    sqc = HiveContext(sc)

    # create a DStream for each topic partition
    topic_dstreams = [
        KafkaUtils.createStream(ssc, zk, app_name, {topic: 1},
                                keyDecoder=oni_decoder,
                                valueDecoder=oni_decoder)
        for _ in range(wrks)
    ]
    tp_stream = ssc.union(*topic_dstreams)

    # Parallelism in data processing
    # processingDStream = tp_stream(wrks)

    # parse the RDD content
    proxy_logs = tp_stream.map(lambda x: proxy_parser(x[1]))

    # save each RDD into Hive
    proxy_logs.foreachRDD(lambda x: save_to_hive(x, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg,
                                   message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time.

    WARNING: This function only works for Spark 1.4.0+.

    Args:
        brokers: the Kafka broker(s) that we look at for the topic
        topic: the Kafka topic for input
        bucket_interval: the time interval in seconds (int) that the job will bucket

    Returns:
        None
    """
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                            {"metadata.broker.list": brokers},
                                            valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                            {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)) \
                           .reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)
    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
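# Hedged usage sketches for the two callables expected above: message_parse maps
# a raw message to its bucket key, and output_msg(sc, ssc) returns the callback
# handed to foreachRDD. Every name below is illustrative, not from the original.
def example_message_parse(line):
    # Assume the bucket key is the first comma-separated field (e.g. a timestamp)
    return line.split(",", 1)[0]

def example_output_msg(sc, ssc):
    def emit(rdd):
        # Print each (bucket, count) pair collected from the micro-batch
        for bucket, count in rdd.collect():
            print("%s: %d" % (bucket, count))
    return emit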
def main():
    sym_dict = {}
    conf = SparkConf().setAppName("symbol stream")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, .1)
    lines = ssc.socketTextStream("localhost", 1337)

    def print_now():
        print(sym_dict)

    def predict(prices):
        print(prices)

    def add_to_dict(line):
        symbol, price, volume = line.split(',')
        if symbol in sym_dict:
            print('made it here')
            sym_dict[symbol][0].append(price)
            sym_dict[symbol][1].append(volume)
            # keep a sliding window of the last 10 prices/volumes per symbol
            if len(sym_dict[symbol][0]) > 10:
                sym_dict[symbol][0].pop(0)
                sym_dict[symbol][1].pop(0)
                predict(sym_dict[symbol][0])
        else:
            sym_dict[symbol] = [[price], [volume]]

    # test = lines.map(lambda line: json.dumps(line))
    test = lines.map(lambda line: line)
    test.pprint()
    ssc.start()
    ssc.awaitTermination()
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url', required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)', default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                             '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)

    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
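# A minimal sketch of the undefined get_word_counts helper, assuming tweets
# arrive as plain text in the Kafka message value and that hashtags are counted
# over a sliding window (the checkpoint directory above enables the inverse
# reduce). The window sizes and hashtag test are assumptions for illustration.
def get_word_counts(kafka_stream):
    return kafka_stream.map(lambda kv: kv[1]) \
        .flatMap(lambda tweet: [w for w in tweet.split() if w.startswith('#')]) \
        .map(lambda hashtag: (hashtag, 1)) \
        .reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b, 60, 5)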
def main():
    if len(sys.argv) not in (3, 4):
        print("Usage: kafka_wordcount.py <zk> <topic> [timeout]", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]

    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout
    ssc.start()
    ssc.awaitTermination(**kwargs)
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(
        ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    # Note: the second element of the tuple is the received Kafka message value
    lines1 = kafkaStreams_lines.map(lambda x: x[1])
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")
    wordcounts.pprint()  # report the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(
        ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    # Note: the second element of the tuple is the received Kafka message value
    lines1 = kafkaStreams_lines.map(lambda x: x[1])
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordcounts)  # prints the DStream object itself, not its contents

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()  # report the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def main(): sc = SparkContext(appName="IntrusionDetector") ssc = StreamingContext(sc, batch_durations) kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker}) kvs.foreachRDD(processRDD) ssc.start() ssc.awaitTermination()
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds,
                                   sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, interval_seconds)
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
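# A hedged sketch of a sql_function suitable for kvs.foreachRDD above: it turns
# each micro-batch of (key, value) tuples into a DataFrame and runs a SQL query.
# The JSON payload, table name, and query are assumptions for illustration.
def example_sql_function(rdd):
    if rdd.isEmpty():
        return
    sql_ctx = SQLContext.getOrCreate(rdd.context)
    # Assume the Kafka value is a JSON document
    df = sql_ctx.read.json(rdd.map(lambda kv: kv[1]))
    df.registerTempTable("events")
    sql_ctx.sql("SELECT COUNT(*) AS n FROM events").show()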
def read_tweets():
    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # 600-second batches; use 60 when testing
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"],
                                        {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
def main(): conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered") sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 1) try: kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2}) kafka_streams.foreachRDD(process_rdd) except Exception as e: print e ssc.start() ssc.awaitTermination()
def start():
    sc = SparkContext(appName='HdfsWordCount')
    ssc = StreamingContext(sc, 5)
    # only files created after the streaming job starts are picked up
    lines = ssc.textFileStream('/user/hive/warehouse/streaming_status/2016091101')
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)
    wordCounts.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def invoke():
    # object to keep track of offsets
    ConfigInitializer.basic_config()

    # app name
    application_name = "mon_metrics_kafka"

    my_spark_conf = SparkConf().setAppName(application_name)
    spark_context = SparkContext(conf=my_spark_conf)

    # read at the configured interval
    spark_streaming_context = \
        StreamingContext(spark_context, cfg.CONF.service.stream_interval)

    kafka_stream = MonMetricsKafkaProcessor.get_kafka_stream(
        cfg.CONF.messaging.topic,
        spark_streaming_context)

    # transform to recordstore
    MonMetricsKafkaProcessor.transform_to_recordstore(kafka_stream)

    # catch interrupt, stop streaming context gracefully
    # signal.signal(signal.SIGINT, signal_handler)

    # start processing
    spark_streaming_context.start()

    # FIXME: stop spark context to relinquish resources
    # FIXME: specify cores, so as not to use all the resources on the cluster.
    # FIXME: HA deploy multiple masters, may be one on each control node

    try:
        # Wait for the Spark driver to "finish"
        spark_streaming_context.awaitTermination()
    except Exception as e:
        MonMetricsKafkaProcessor.log_debug(
            "Exception raised during Spark execution : " + str(e))
        # One exception that can occur here is the result of the saved
        # kafka offsets being obsolete/out of range. Delete the saved
        # offsets to improve the chance of success on the next execution.

        # TODO(someone) prevent deleting all offsets for an application,
        # but just the latest revision
        MonMetricsKafkaProcessor.log_debug(
            "Deleting saved offsets for chance of success on next execution")

        MonMetricsKafkaProcessor.reset_kafka_offsets(application_name)

        # delete pre hourly processor offsets
        if cfg.CONF.stage_processors.pre_hourly_processor_enabled:
            PreHourlyProcessor.reset_kafka_offsets()
def sparkTask():
    from textblob import TextBlob
    import re
    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    sc = SparkContext()
    ssc = StreamingContext(sc, 1)
    quotes = ssc.socketTextStream("localhost", 9999)
    # strip non-alphanumerics, then emit (first 60 chars uppercased, polarity)
    dataSentencesPolarity = quotes.map(
        lambda x: TextBlob(re.sub('[^A-Za-z0-9 \.\']+', '', x))).map(
        lambda y: (str(y.upper())[:60], y.sentiment.polarity))
    dataSentencesPolarity.pprint()
    ssc.start()               # Start the computation
    ssc.awaitTermination(20)  # Wait for the computation to terminate
def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store',
                      dest='enriched_data_path',
                      help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path',
                      help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store',
                      dest='kafka_zookeeper_hosts',
                      help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store',
                      dest='kafka_broker_list',
                      help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store',
                      dest='kafka_message_topic',
                      help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store',
                      dest='kafka_alert_topic',
                      help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store',
                      dest='kafka_enriched_data_topic',
                      help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float',
                      default=15.0, action='store',
                      dest='streaming_batch_duration_sec',
                      help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
                      action='store', dest='max_batches',
                      help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: process_messages(
        time, rdd,
        ssc=ssc,
        model=model,
        enriched_data_path=options.enriched_data_path,
        zookeeper_hosts=options.kafka_zookeeper_hosts,
        kafka_alert_topic=options.kafka_alert_topic,
        kafka_enriched_data_topic=options.kafka_enriched_data_topic,
        max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
def main(): sc = SparkContext(appName="PythonStreamingKafkaWordCount") ssc = StreamingContext(sc, 1) zkQuorum = "localhost:2181" topic = "twitter_raw" kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1}) lines = kvs.map(lambda x: pickle.loads(x[1].decode("utf-8"))["text"]) # fetch the text count = lines.map(lambda line: len(line.split())).reduce(add) # split into words and count count.foreachRDD(publishToRedis) # publish to redis count.pprint() ssc.start() ssc.awaitTermination()
def start(self): sc = SparkContext(appName="PythonStreamingNOTHS") ssc = StreamingContext(sc, 10) kvs = KafkaUtils.createStream(ssc, self.zkQuorum, "spark-streaming-consumer", {self.topic: 1}) print('******* Event received in window: ', kvs.pprint()) if topic == 'NOTHS-crawler-topic': kvs.foreachRDD(self.save_crawler_hbase) elif topic == 'NOTHS-trends-topic': kvs.foreachRDD(self.save_trends_hbase) ssc.start() ssc.awaitTermination()
class xStreamProcessor:
    ip = socket.gethostbyname(socket.gethostname())
    port = 9999
    dstream = None
    sc = None
    ssc = None

    # def __init__(self, ip=None, port=None, spark_master='spark://localhost:7077'):
    def __init__(self, ip=None, port=None, spark_master='mesos://10.0.2.85:5050'):
        if ip is not None:
            self.ip = ip
        if port is not None:
            self.port = port

        self.sc = SparkContext(master=spark_master, appName='StreamProcessor')
        self.ssc = StreamingContext(self.sc, 1)
        # self.ssc.checkpoint(directory=None)

        hiveContext = HiveContext(self.sc)
        hiveContext.sql('DROP TABLE IF EXISTS default.tweet_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.tweet_stream '
                        '(ip STRING, port STRING, date_time STRING, user STRING, msg STRING)')
        hiveContext.sql('DROP TABLE IF EXISTS default.email_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_stream (ip STRING, port STRING, date_time STRING, \
            fr STRING, to STRING, subject STRING, content STRING, subject_sentiment INT, content_sentiment INT, \
            subject_power INT, content_power INT, subject_topic INT, content_topic INT, fraud_score DOUBLE)')
        hiveContext.sql('DROP TABLE IF EXISTS default.email_graph')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_graph (fr STRING, to STRING, dt STRING)')
        hiveContext.sql('DROP TABLE IF EXISTS default.trans_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.trans_stream (ip STRING, port STRING, date_time STRING, user STRING, amount DOUBLE, \
            big_trans INT, is_in_odd_day INT, is_at_odd_time INT)')

        self.dstream = self.ssc.socketTextStream(self.ip, self.port)
        self.process_stream()
        self.ssc.start()
        self.ssc.awaitTermination()

    def process_stream(self):
        # split each record into its fields (map, not flatMap, so p[3] is the
        # fourth field rather than the fourth character) and count occurrences
        parts = self.dstream.map(lambda line: line.split("|"))
        words = parts.map(lambda p: p[3])
        pairs = words.map(lambda word: (word, 1))
        wordCounts = pairs.reduceByKey(lambda x, y: x + y)
        # Print the first ten elements of each RDD generated in this DStream
        wordCounts.pprint()
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder,
                                        valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]) \
        .flatMap(lambda row: row.split("\n")) \
        .filter(lambda row: rex_date.match(row)) \
        .map(lambda row: row.strip("\n").strip("\r")
                            .replace("\t", " ").replace("  ", " ")) \
        .map(lambda row: split_log_entry(row)) \
        .map(lambda row: proxy_parser(row))
    proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
def main():
    brokers = 'localhost:9092'
    topic = 'openbmp.parsed.unicast_prefix'
    sc = SparkContext(appName='BGPPrefixOriginValidation')
    ssc = StreamingContext(sc, 2)

    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [topic], {'metadata.broker.list': brokers})
    # directKafkaStream.pprint()

    lines = directKafkaStream.flatMap(lambda x: x[1].splitlines()) \
                             .filter(lambda line: line.startswith('add'))
    structured_rdd = lines.map(structure_data)
    structured_rdd.foreachRDD(
        lambda rdd: rdd.foreachPartition(validate_bgp_prefix))

    ssc.start()
    ssc.awaitTermination()
def start_spark_streaming():
    question_db = QuestionDatabase(QB_QUESTION_DB)
    features = {name: instantiate_feature(name, question_db)
                for name in FEATURE_NAMES}

    sc = create_sc()
    b_features = sc.broadcast(features)
    ssc = StreamingContext(sc, 5)

    ssc.socketTextStream('localhost', 9999) \
        .repartition(QB_STREAMING_CORES - 1) \
        .flatMap(lambda line: generate_guesses(line, b_features)) \
        .map(lambda sg: evaluate_features(sg, b_features)) \
        .foreachRDD(score_and_save)

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
def main():
    master = 'local[2]'
    app_name = 'reduce_demo1'
    # print(range(0, 3))

    sc = SparkContext(master, app_name)
    ssc = StreamingContext(sc, 15)

    host = 'localhost'
    port = 9999
    stream = ssc.socketTextStream(host, port)

    stream.foreachRDD(fun_union_in_dstream_foreachRDD)

    ssc.start()
    ssc.awaitTermination()
def start():
    # local test mode
    # sc = SparkContext('local[2]', appName='NetworkWordCount')
    sc = SparkContext(appName='NetworkWordCount')
    ssc = StreamingContext(sc, 1)

    # Create a DStream that will connect to hostname:port, like localhost:9999
    lines = ssc.socketTextStream("10.5.24.137", 9999)
    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))
    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordCounts)  # prints the DStream object itself, not its contents

    # Print the first ten elements of each RDD generated in this DStream
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url', required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)', default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket ip address to attach for streaming '
                             'text data (default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    parser.add_argument('--model', help='the serialized model to use',
                        default='model.json')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest
    model = args.model

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    somv = fromJSON(model)
    som = sc.broadcast(somv)

    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url, som))

    ssc.start()
    ssc.awaitTermination()
def consumer():
    def process(time, rdd):
        global words
        words += Counter(dict(rdd.collect()))

    sc = SparkContext(appName='graaftel')
    ssc = StreamingContext(sc, 5)

    lines = ssc.socketTextStream(os.getenv('PRODUCER_SERVICE_HOST', 'localhost'),
                                 int(os.getenv('PRODUCER_SERVICE_PORT', 8080)))
    counts = lines.flatMap(lambda line: line.encode('ascii', 'ignore').lower().split()) \
                  .map(lambda word: word.translate(None, string.punctuation)) \
                  .filter(lambda word: word not in stop_words) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(add)
    counts.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
def main():
    # main function to execute code
    sc = SparkContext(appName="ReadingWriter")
    ssc = StreamingContext(sc, 10)
    sqlContext = SQLContext(sc)

    zk_host = zk_ip + ":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions = {"amtest": 1}

    # create kafka stream
    kvs = KafkaUtils.createStream(ssc, zk_host, consumer_group, kafka_partitions,
                                  valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    # readings = lines.map(lambda x: {"device_id": x["device_id"], "metric_time": x["metric_time"], "metric_name": x["metric_name"], "metric_value": x["metric_value"]})
    readings = lines.map(lambda x: {
        "device_id": x["device_id"],
        "metric_time": datetime.datetime.fromtimestamp(int(x["metric_time"])),
        "metric_name": x["metric_name"],
        "metric_value": float(x["metric_value"])
    })
    readings.foreachRDD(lambda rdd: rdd.saveToCassandra("metrics", "raw_metrics"))
    # readingdf.show()
    # readings.pprint()
    # lines.saveToCassandra("metrics", "raw_metrics")

    ssc.start()
    ssc.awaitTermination()
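# A hedged sketch of the undefined valueDecoder used above, assuming the Kafka
# payload is UTF-8 JSON (the stream is later indexed like a dict); the original
# decoder may differ.
def decoder(payload):
    import json
    return json.loads(payload.decode("utf-8")) if payload is not None else None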
def main():
    # main function to execute code
    sc = SparkContext(appName="CouponCounterPySpark")
    ssc = StreamingContext(sc, 10)

    zk_host = "localhost:2181"
    consumer_group = "coupon-event-consumers"
    kafka_partitions = {"test": 1}

    # create kafka stream
    lines = KafkaUtils.createStream(ssc, zk_host, consumer_group, kafka_partitions)
    events = lines.map(lambda line: line[1].split(','))
    tmpagg = events.map(lambda event: ((event[1]), 1))
    coupon_counts = tmpagg.reduceByKey(lambda x, y: x + y)
    coupon_records = coupon_counts.map(lambda x: {
        "offer_id": x[0],
        "bucket": str(datetime.datetime.now().strftime("%s")),
        "count": int(x[1])
    })
    # coupon_records.registerTempTable("coupon_counters")
    # coupon_records.select("offer_id", "bucket", "count").show()
    coupon_records.pprint()
    coupon_records.foreachRDD(
        lambda rdd: rdd.saveToCassandra("loyalty", "coupon_counters"))

    ssc.start()
    ssc.awaitTermination()
class Consumer:
    """Simple Spark Kafka streaming consumer."""

    def __init__(self, casshost, interval, zookeeper, topic):
        self.conf = SparkConf().setAppName("KafkaSpark") \
            .set("spark.cassandra.connection.host", casshost)
        self.sc = SparkContext(conf=self.conf)
        self.sqlContext = SQLContext(sparkContext=self.sc)
        self.ssc = StreamingContext(self.sc, batchDuration=interval)
        self.zookeeper = zookeeper
        self.topic = topic

    def check_and_write(self, x):
        try:
            x.toDF().write.format("org.apache.spark.sql.cassandra") \
                .options(table="test1", keyspace="mykeyspace").save(mode="append")
        except ValueError:
            print("No rdd found!")

    def consume(self):
        messages = KafkaUtils.createStream(self.ssc, self.zookeeper,
                                           "spark-streaming-consumer",
                                           {self.topic: 1})
        lines = messages.map(lambda x: x[1])
        rows = lines.map(lambda x: {
            "data": json.loads(x)['data'],
            "time": json.loads(x)['time']
        })
        rows.foreachRDD(lambda x: self.check_and_write(x))
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        # SQLContext has no stop() of its own; stop streaming first, then the context
        if self.ssc is not None:
            self.ssc.stop()
        if self.sc is not None:
            self.sc.stop()
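# Hedged usage sketch for the Consumer class above; the host, interval, and
# topic values are placeholders, not configuration from the original.
if __name__ == '__main__':
    consumer = Consumer(casshost='127.0.0.1', interval=5,
                        zookeeper='localhost:2181', topic='test1')
    consumer.consume()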
class StreamingDriver(object):
    def __init__(self, conf):
        # initialize config params
        self.batch_interval = conf['batch_interval']
        self.window_length = conf['window_length']
        self.sliding_interval = conf['sliding_interval']
        self.sm_socket = tuple(conf['sm_socket'])
        self.sm_listener = Listener(self.sm_socket)
        self.op_handler_socket = conf['op_handler_socket']

        self.spark_stream_address = conf['spark_stream_address']
        self.spark_stream_port = conf['spark_stream_port']

        self.start_time = time.time()

        self.sc = SparkContext(appName="Sonata-Streaming")
        self.sc.setLogLevel("OFF")
        self.ssc = StreamingContext(self.sc, self.batch_interval)

    def start(self):
        lines = self.ssc.socketTextStream(self.spark_stream_address,
                                          self.spark_stream_port)
        pktstream = (lines.map(lambda line: processLogLine(line)))
        print(self.window_length, self.sliding_interval)
        self.process_pktstream(pktstream)
        self.ssc.start()
        self.ssc.awaitTermination()

    def process_pktstream(self, pktstream):
        print("pktstream")

        spark_queries = {}

        conn = self.sm_listener.accept()
        raw_data = conn.recv()
        data = pickle.loads(raw_data)

        queries = data['queries']
        join_queries = data['join_queries']

        for queryId in queries:
            query = queries[queryId]

            if not query.has_join and queryId not in join_queries:
                query_str = "pktstream.window(self.window_length, self.sliding_interval).transform(lambda rdd: (rdd.filter(lambda p : (p[1]==str('" + str(
                    queryId) + "'))).map(lambda p : (p[2:]))." + query.compile(
                ) + ")).foreachRDD(lambda rdd:send_reduction_keys(rdd, " + str(
                    self.op_handler_socket) + "," + str(
                    self.start_time) + ",\'" + str(queryId) + "\'))"
                print(query_str)
                spark_queries[queryId] = eval(query_str)
            elif not query.has_join and queryId in join_queries:
                query_str = "pktstream.window(self.window_length, self.sliding_interval).transform(lambda rdd: (rdd.filter(lambda p : (p[1]==str('" + str(
                    queryId) + "'))).map(lambda p : (p[2:]))." + query.compile(
                ) + "))"
                # .foreachRDD(lambda rdd:send_reduction_keys(rdd, " + str(self.op_handler_socket) + "," + str(self.start_time) + ",\'" + str(queryId) + "\'))"
                print(query_str)
                spark_queries[queryId] = eval(query_str)
            else:
                query_str = query.compile(
                ) + ".foreachRDD(lambda rdd: print(\"Join \" + str(rdd.take(5))))"
                print(query_str)
                spark_queries[queryId] = eval(query_str)
class Spark_Tracker():
    """Stream WebCam Images to Kafka Endpoint."""

    def __init__(self,
                 interval=0.1,
                 topic_to_consume='test',
                 topic_for_produce='position',
                 kafka_endpoint='master:6667'):
        """Initialize our yolo model."""
        self.yolo = YOLO()
        # Create Kafka Producer for sending results
        self.topic_to_consume = topic_to_consume
        self.topic_for_produce = topic_for_produce
        self.kafka_endpoint = kafka_endpoint
        self.producer = KafkaProducer(
            bootstrap_servers=kafka_endpoint,
            value_serializer=lambda m: json.dumps(m).encode('utf8'))

        # Initialize Spark environment
        sc = SparkContext(appName='VideoTics')
        self.ssc = StreamingContext(sc, interval)

        # Make Spark logging less extensive
        log4jLogger = sc._jvm.org.apache.log4j
        log_level = log4jLogger.Level.ERROR
        log4jLogger.LogManager.getLogger('org').setLevel(log_level)
        log4jLogger.LogManager.getLogger('akka').setLevel(log_level)
        log4jLogger.LogManager.getLogger('kafka').setLevel(log_level)
        self.logger = log4jLogger.LogManager.getLogger(__name__)
        self.objects_detected_view_text = ""

        # Set deep_sort params
        self.max_cosine_distance = 0.3
        self.nn_budget = None
        self.model_filename = 'model_data/mars-small128.pb'
        self.nms_max_overlap = 1.0
        self.encoder = gdet.create_box_encoder(self.model_filename, batch_size=1)
        self.metric = nn_matching.NearestNeighborDistanceMetric(
            "cosine", self.max_cosine_distance, self.nn_budget)
        self.tracker = Tracker(self.metric)

    def start_processing(self):
        """Start consuming from Kafka endpoint and detect objects."""
        kvs = KafkaUtils.createDirectStream(
            self.ssc,
            [self.topic_to_consume],
            {'metadata.broker.list': self.kafka_endpoint})
        kvs.foreachRDD(self.process_frame)
        self.ssc.start()
        self.ssc.awaitTermination()

    def detect_person_track(self, event):
        """Use Yolo to detect persons."""
        try:
            decoded = base64.b64decode(event['image'])
        except TypeError:
            return
        # TODO: Picking unique filenames or find a way to send it to kafka
        filename = 'codev1frame.jpg'  # find a way to pick unique filenames
        with open(filename, 'wb') as f:
            f.write(decoded)
        frame = cv2.imread(filename)
        image = Image.fromarray(frame[..., ::-1])  # bgr to rgb
        boxs = self.yolo.detect_image(image)
        # print("box_num", len(boxs))
        features = self.encoder(frame, boxs)
        # score set to 1.0 here
        detections = [
            Detection(bbox, 1.0, feature)
            for bbox, feature in zip(boxs, features)
        ]
        # Run non-maxima suppression.
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        indices = preprocessing.non_max_suppression(boxes,
                                                    self.nms_max_overlap,
                                                    scores)
        detections = [detections[i] for i in indices]

        # Use deep-sort to track persons: call the tracker
        self.tracker.predict()
        self.tracker.update(detections)

        for track in self.tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlbr()
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                          (int(bbox[2]), int(bbox[3])), (255, 255, 255), 2)
            cv2.putText(frame, str(track.track_id),
                        (int(bbox[0]), int(bbox[1])), 0, 5e-3 * 200,
                        (0, 255, 0), 2)

        for det in detections:
            bbox = det.to_tlbr()
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                          (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)

        # send results to kafka
        # note: this indexes tracker.tracks with the box index, which assumes
        # the two lists line up one-to-one; deep-sort does not guarantee that
        if len(boxs) > 0:
            for i in range(0, len(boxs)):
                self.objects_detected_view_text = (
                    'ID:' + str(self.tracker.tracks[i].track_id) +
                    ' x:' + str(boxs[i][0]) + ' y:' + str(boxs[i][1]) +
                    ' width:' + str(boxs[i][2]) + ' height:' + str(boxs[i][3]))
                result = {
                    'ID': str(self.tracker.tracks[i].track_id),
                    'timestamp': dt.datetime.now().isoformat(),
                    'location_x': str(boxs[i][0]),
                    'w': str(boxs[i][2]),
                    'image': self.convert_image_to_text(frame)
                }
                self.producer.send('position', result)
                self.producer.flush()
                self.logger.info('prediction: ' + self.objects_detected_view_text)
        return

    def convert_image_to_text(self, frame):
        img_str = cv2.imencode('.jpeg', frame)[1]
        img_as_text = base64.b64encode(img_str).decode('utf-8')
        return img_as_text

    def process_frame(self, timestamp, dataset):
        # Definition of the parameters
        to_process = {}
        data = dataset.collect()
        self.logger.info('\033[3' + str(randint(1, 7)) + ';1m' +  # Color
                         '-' * 25 +
                         '[ NEW MESSAGES: ' + str(len(data)) + ' ]' +
                         '-' * 25 +
                         '\033[0m')  # End color
        dt_now = dt.datetime.now()
        for datum in data:
            event = json.loads(datum[1])
            self.logger.info('Received Message: ' +
                             event['camera_id'] + ' - ' + event['timestamp'])
            dt_event = dt.datetime.strptime(event['timestamp'],
                                            '%Y-%m-%dT%H:%M:%S.%f')
            delta = dt_now - dt_event
            # print("timestamp = " + str(dt_event))
            if delta.seconds > 5:
                continue
            to_process[event['camera_id']] = event

        if len(to_process) == 0:
            self.logger.info('Skipping processing...')

        for key, event in to_process.items():
            self.logger.info('Processing Message: ' +
                             event['camera_id'] + ' - ' + event['timestamp'])
            start = timer()
            detection_result = self.detect_person_track(event)
            end = timer()
            delta = end - start
            self.logger.info('Done after ' + str(delta) + ' seconds.')
            try:
                if detection_result:
                    self.logger.info('Sent image to Kafka endpoint.')
            except AssertionError:
                self.objects_detected_view_text = 'No person found!'
                continue
            # Press Q to stop!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName("TwitterStreamApp")
# create spark instance with the above configuration
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 5)  # 5 second batch interval

IP = "localhost"  # Replace with your stream IP
Port = 9009       # Replace with your stream port

lines = ssc.socketTextStream(IP, Port)
lines.pprint()  # Print tweets we find to the console

ssc.start()             # Start reading the stream
ssc.awaitTermination()  # Wait for the process to terminate

# TweetRead.py
# This first python script doesn't use Spark at all:
import os
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import socket
import json

from apiConfigs import twitterConfigs

consumer_key = twitterConfigs.apiKey
consumer_secret = twitterConfigs.secretKey
access_token = twitterConfigs.token
def hastags_func(line):
    n = line.split(";")[7]
    if ',' in n:
        return n.split(",")
    return [n]

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, int(sys.argv[2]))
ssc.checkpoint("~/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

# hashtags_count = dataStream.flatMap(hastags) \
#     .map(lambda hashtag: (hashtag, 1)) \
#     .reduceByKeyAndWindow(lambda x, y: int(x) + int(y), int(sys.argv[1]), 1)
hash_count = dataStream.window(int(sys.argv[1]), 1) \
    .flatMap(hastags_func) \
    .map(lambda hashtag: (hashtag, 1)) \
    .reduceByKey(lambda x, y: int(x) + int(y))
hash_count.foreachRDD(sorted_print)

ssc.start()
ssc.awaitTermination(25)
ssc.stop()
    for record in taken[:num]:
        with open("my_chem.csv", "a") as myfile:
            writer = csv.writer(myfile)
            writer.writerow(list(record))

val.foreachRDD(takeAndPrint)

player_r = player_r.reduceByKey(lambda x, y: (x + y) / 2)
player_rr = player_r.map(lambda x: (x[0][0], x[1]))
player_rr = player_rr.map(lambda x: (1, x))
date = date.map(lambda x: (1, x))
player_rr = player_rr.join(date)
player_rr = player_rr.map(lambda x: (x[1][0][0], x[1][0][1], x[1][1]))
# player_rr.pprint()
tpprint_rating(player_rr)
# tpprint_chemistry_same(chemistry_same)
# tpprint_chemistry_opp(chemistry_opp)

player_profile1 = player_profile.map(
    lambda x: (x[0][0], (x[0][0], x[1][0], x[1][1], x[1][2], x[1][3], x[1][4])))
tpprint_player_profile(player_profile1)

all_chemistry = chemistry_same.union(chemistry_opp)
all_chemistry = all_chemistry.map(lambda x: (x[0][0], x[0][1], x[1]))
tpprint_chem(all_chemistry)
# all_chemistry.pprint()

ssc.start()
ssc.awaitTermination(250)  # seconds to wait; use awaitTermination() to wait indefinitely
ssc.stop()
    optimizing_fn=max)

if __name__ == '__main__':
    if len(sys.argv) < 4:
        raise Exception(
            "Insufficient Arguments: python3 main.py <port> <stream_interval> <path_to_test_data>"
        )

    APPNAME = "Poker Hand Classification"
    HOSTNAME = "localhost"
    MASTER = "local[4]"
    PORT = int(sys.argv[1])
    STREAM_INTERVAL = int(sys.argv[2])
    TEST_DATA_FILE_PATH = sys.argv[3]
    MINIMUM_RDD_LIST_LEN = 3
    K = 2
    RDD_LIST = list()

    ec_spark_context = SparkContext(MASTER, APPNAME)
    ec_spark_streaming_context = StreamingContext(
        ec_spark_context, batchDuration=STREAM_INTERVAL)

    # Read the test data set.
    poker_hands_test_rdd = ec_spark_context.textFile(TEST_DATA_FILE_PATH).map(
        tokenize).filter(not_contains_null).map(make_labeled_point)

    submit_spark_app(HOSTNAME, PORT)

    ec_spark_streaming_context.start()
    ec_spark_streaming_context.awaitTermination()
#) """ data = kafkaStream.map(lambda line: json.loads(line) """ rsvp = kafkaStream.map(lambda line: line[1]) rsvp2 = rsvp.map(lambda line: json.loads(line.encode("ascii", "ignore"))) #kafkaStream.pprint() """ process = data.mapValues(lambda line: line.encode('ascii')).cache() """ #event = data["topic_name"] #print(data.pprint()) print(rsvp.pprint()) """ print (data.mapValues(enc).pprint()) """ #print (event.pprint()) ssc.start() time.sleep(100) ssc.stop(stopSparkContext=True, stopGraceFully=True) """ ssc.awaitTermination().stop(stopSparkContext=True, stopGraceFully=True) """ #bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8:2.1.0 #bin/spark-submit --jars <spark-streaming-kafka-0-8-assembly.jar>
return line.split(";")[7].split(",") def f1(r): ab1 = r.sortBy(lambda x: (-x[1], x[0])) ab2 = ab1.collect() c = 0 i = 0 if (ab2 != []): while (c != 5): if (ab2[i][0] != ''): if (c != 4): print(ab2[i][0], end=',') else: print(ab2[i][0]) c += 1 i += 1 context = SparkContext(conf=conf) s1 = StreamingContext(context, int(sys.argv[2])) s1.checkpoint("~/checkpoint_BIGDATA") stream = s1.socketTextStream("localhost", 9009) x1 = stream.window(int(sys.argv[1]), 1) x2 = x1.flatMap(rc).map(lambda nam: (nam, 1)) output = x2.reduceByKey(lambda a, b: int(a) + int(b)) output.foreachRDD(f1) s1.start() s1.awaitTermination(25) s1.stop()
class Spark_Object_Detector():
    """Stream WebCam Images to Kafka Endpoint."""

    def __init__(self,
                 interval=10,
                 topic_to_consume='test',
                 topic_for_produce='resultstream',
                 kafka_endpoint='127.0.0.1:9092'):
        """Initialize our yolo and firearm models."""
        self.detector = SuspicionDetection.SuspicionDetection()
        self.detector.enable_yolo_detection()
        self.detector.enable_firearm_detection()

        # Initialize Spark & TensorFlow environment
        self.topic_to_consume = topic_to_consume
        self.topic_for_produce = topic_for_produce
        self.kafka_endpoint = kafka_endpoint

        # Create Kafka Producer for sending results
        self.producer = KafkaProducer(bootstrap_servers=kafka_endpoint)

        sc = SparkContext(appName='FirmArmDetection')
        self.ssc = StreamingContext(sc, interval)

        # Make Spark logging less extensive
        log4jLogger = sc._jvm.org.apache.log4j
        log_level = log4jLogger.Level.ERROR
        log4jLogger.LogManager.getLogger('org').setLevel(log_level)
        log4jLogger.LogManager.getLogger('akka').setLevel(log_level)
        log4jLogger.LogManager.getLogger('kafka').setLevel(log_level)
        self.logger = log4jLogger.LogManager.getLogger(__name__)
        self.objects_detector_prediction = []
        self.objects_detected_view_text = ""

    def _update_predictions(self):
        self.objects_detector_prediction = self.detector.get_yolo_prediction()
        self.firearm_detector_prediction = (
            self.detector.get_firearm_detector_prediction())
        self.activity_detector_prediction = (
            self.detector.get_activity_detector_prediction())
        self.event_detector_prediction = (
            self.detector.get_event_detector_prediction())
        self.detected_objects = []
        if self.objects_detector_prediction:
            self.detected_objects.extend(self.objects_detector_prediction)
        if self.firearm_detector_prediction:
            self.detected_objects.extend(self.firearm_detector_prediction)
        if self.detected_objects:
            self._update_detected_objects(self.detected_objects)

    def _update_detected_objects(self, objects_prediction):
        parsed_objects = [p['label'] for p in objects_prediction]
        parsed_objects_dict = collections.Counter(parsed_objects)
        detected_suspicious_objects = False
        objects = ''
        for (obj, count) in parsed_objects_dict.items():
            objects += '%s (%d)\n' % (obj, count)
            if obj in vgconf.SUSPICIOUS_OBJECTS_LIST:
                detected_suspicious_objects = True
        self.objects_detected_view_text = objects
        # Start an alert when a suspicious object is detected:
        # if detected_suspicious_objects:
        #     self._start_alert()

    def start_processing(self):
        """Start consuming from Kafka endpoint and detect objects."""
        kvs = KafkaUtils.createDirectStream(
            self.ssc,
            [self.topic_to_consume],
            {'metadata.broker.list': self.kafka_endpoint})
        kvs.foreachRDD(self.handler)
        self.ssc.start()
        self.ssc.awaitTermination()

    def detect_objects(self, event):
        """Use Yolo and Inception models to detect objects."""
        decoded = base64.b64decode(event['image'])
        # TODO: Picking unique filenames or find a way to send it to kafka
        filename = 'C:\\Users\\hp\\Desktop\\codev1frame.jpg'  # assumes a way of picking unique filenames
        with open(filename, 'wb') as f:
            f.write(decoded)
        img = cv2.imread(filename)

        # Prepare object for sending to endpoint
        result = {
            'timestamp': event['timestamp'],
            'camera_id': event['camera_id'],
            'image': self.get_box_plot(img),
            'prediction': self.objects_detected_view_text
        }
        return json.dumps(result)

    def get_box_plot(self, img):
        self.detector.detect(img)
        frame = self.detector.plot_objects(img)
        self._update_predictions()
        img_str = cv2.imencode('.jpeg', frame)[1]
        img_as_text = base64.b64encode(img_str).decode('utf-8')
        return img_as_text

    def handler(self, timestamp, message):
        """Collect messages, detect objects and send to kafka endpoint."""
        records = message.collect()
        # For performance reasons, we only want to process the newest message
        # for every camera_id
        to_process = {}
        self.logger.info('\033[3' + str(randint(1, 7)) + ';1m' +  # Color
                         '-' * 25 +
                         '[ NEW MESSAGES: ' + str(len(records)) + ' ]' +
                         '-' * 25 +
                         '\033[0m')  # End color
        dt_now = dt.datetime.now()
        for record in records:
            event = json.loads(record[1])
            self.logger.info('Received Message: ' +
                             event['camera_id'] + ' - ' + event['timestamp'])
            dt_event = dt.datetime.strptime(event['timestamp'],
                                            '%Y-%m-%dT%H:%M:%S.%f')
            delta = dt_now - dt_event
            print("timestamp = " + str(dt_event))
            if delta.seconds > 5:
                continue
            to_process[event['camera_id']] = event

        if len(to_process) == 0:
            self.logger.info('Skipping processing...')

        for key, event in to_process.items():
            self.logger.info('Processing Message: ' +
                             event['camera_id'] + ' - ' + event['timestamp'])
            start = timer()
            detection_result = self.detect_objects(event)
            self.logger.info('prediction: ' + self.objects_detected_view_text)
            end = timer()
            delta = end - start
            self.logger.info('Done after ' + str(delta) + ' seconds.')
            self.producer.send(self.topic_for_produce,
                               detection_result.encode('utf-8'))
            self.logger.info('Sent image to Kafka endpoint.')
            self.producer.flush()
            print(','.join(array))
            return
        else:
            array.append(element[0])
            count += 1

window_size = int(sys.argv[1])
batch_size = int(sys.argv[2])

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
# sc.setLogLevel('ERROR')
ssc = StreamingContext(sc, 1)
ssc.checkpoint("./checkpoint_BIGDATA")
dataStream = ssc.socketTextStream("localhost", 9009)

hashtags = dataStream.map(lambda x: ret_tags(''.join(x.split(';')[7]))) \
    .window(window_size, batch_size) \
    .flatMap(lambda x: parse(x)) \
    .filter(lambda x: not (x == '')) \
    .map(lambda x: (x, 1))
tagcounts = hashtags.reduceByKey(lambda x, y: x + y)
# foreachRDD returns None, so the sort must happen inside transform()
sorted_tagcounts = tagcounts.transform(
    lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))
sorted_tagcounts.foreachRDD(lambda rdd: print_top_5(rdd.collect()))

ssc.start()
ssc.awaitTermination(100)
ssc.stop()
    temp_rdd_2 = temp_rdd.collect()
    if temp_rdd_2 != []:
        # print up to the top five entries, comma-separated
        top = min(5, len(temp_rdd_2))
        for i in range(top):
            if i != top - 1:
                print(temp_rdd_2[i][0], end=",")
            else:
                print(temp_rdd_2[i][0])

window_size = int(sys.argv[1])
batch = int(sys.argv[2])

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, batch)
ssc.checkpoint("~/checkpoint_BIGDATA")
dataStream = ssc.socketTextStream("localhost", 9009)

tweets = dataStream.window(window_size, 1)
flat_tweets = tweets.flatMap(line_split).map(lambda w: (w, 1))
reduced_tweets = flat_tweets.reduceByKey(lambda x, y: int(x) + int(y))
reduced_tweets.foreachRDD(printrdd2)

ssc.start()
ssc.awaitTermination(60)
ssc.stop()
def main():
    # parse configuration
    app_id = int(sys.argv[1])
    master = sys.argv[2]
    app_name = sys.argv[3]

    # application configuration
    assert APP_CONFIG.get(app_id) is not None, \
        '[myapp streaming_app_main.main()] configuration error: invalid APP_CONFIG with app.id = ' + str(app_id)
    app_conf = map_conf_properties(APP_CONFIG.get(app_id), 'app.id')[app_id]
    spark_home = app_conf['sparkHome']
    pyFiles = app_conf['pyFiles.list']
    di_id = app_conf.get('app.interfaceId')

    # data interface configuration
    di_in_conf_with_ds_conf = get_di_conf_with_ds_conf(
        di_id, DATAINTERFACE_CONFIG, DATASOURCE_CONFIG,
        di_key='interface.id', di_ds_key='interface.sourceId',
        ds_key='source.id', merge_key_name='interface.id')[di_id]
    print('= = ' * 20, type(di_in_conf_with_ds_conf), 'di_in_conf_with_ds_conf = ')
    pprint(di_in_conf_with_ds_conf)

    schema_conf_string = di_in_conf_with_ds_conf['schema']
    struct_type = generate_df_schmea(schema_conf_string)
    # schema_field_list = [x.name for x in struct_type.fields]
    di_in_conf_with_ds_conf['struct.type'] = struct_type
    # di_in_conf_with_ds_conf['struct.field.list'] = schema_field_list

    di_out_confs = [kv for kv in DATAINTERFACE_CONFIG.iteritems()
                    if kv[1].get('interface.type', '') == 'output']
    print('= = ' * 20, type(di_out_confs), 'di_out_confs = ')
    pprint(di_out_confs)

    di_out_confs_with_ds_conf = list_dict_merge(
        [get_di_conf_with_ds_conf(kv[0], DATAINTERFACE_CONFIG, DATASOURCE_CONFIG,
                                  di_key='interface.id',
                                  di_ds_key='interface.sourceId',
                                  ds_key='source.id',
                                  merge_key_name='interface.id')
         for kv in DATAINTERFACE_CONFIG.iteritems()
         if kv[1].get('interface.type', '') == 'output'])
    print('= = ' * 20, type(di_out_confs_with_ds_conf),
          'di_out_confs_with_ds_conf = ')
    pprint(di_out_confs_with_ds_conf)

    # external cache configuration
    cache_confs_with_ds_conf = list_dict_merge(
        [get_di_conf_with_ds_conf(kv[0], CACHE_CONFIG, DATASOURCE_CONFIG,
                                  di_key='cache.id',
                                  di_ds_key='cache.sourceId',
                                  ds_key='source.id',
                                  merge_key_name='cache.id')
         for kv in CACHE_CONFIG.iteritems()])
    print('= = ' * 20, type(cache_confs_with_ds_conf),
          'cache_confs_with_ds_conf = ')
    pprint(cache_confs_with_ds_conf)

    # prepare-stage configuration for the given input interface:
    # keep only the enabled steps.
    # Note: when filtering a dict, the argument passed to the function is the key
    prepares_config_active = PREPARES_CONFIG[di_id] \
        if PREPARES_CONFIG.get(di_id, {}).get('prepares.enabled', False) else {}
    # print('= = ' * 20, type(prepares_config_active), 'prepares_config_active = ')
    # pprint(prepares_config_active)

    # TODO: verify the two variants below give equal results, then remove the
    # commented code
    # prepares_config_active_steps = filter(
    #     lambda step_conf: step_conf[1].get('step.enabled', False),
    #     map(lambda step_conf: (step_conf[0], map_conf_properties(step_conf[1])),
    #         prepares_config_active.get('steps', {}).iteritems()))
    prepares_config_active_steps = \
        [(k, map_conf_properties(v))
         for k, v in prepares_config_active.get('steps', {}).iteritems()
         if v.get('step.enabled', False)]
    print('= = ' * 20, type(prepares_config_active_steps),
          'prepares_config_active_steps = ')
    pprint(prepares_config_active_steps)

    # compute-stage configuration for the given input interface.
    # After filtering this becomes a list whose elements are tuples
    # (computeStatistics.id, computeStatistics.conf_dict).
    computes_config_active = COMPUTES_CONFIG[di_id] \
        if COMPUTES_CONFIG.get(di_id, {}).get('computeStatistics.enabled', False) else {}
    # list[{computeStatistic.id: {conf}}, ...]
    # TODO: verify the two variants below give equal results, then remove the
    # commented code
    # compute_computeStatistics_config_active = filter(
    #     lambda computeStatistic_conf: computeStatistic_conf[1].get('computeStatistic.enabled', False),
    #     computes_config_active.get('computeStatistics', {}).iteritems())
    compute_computeStatistics_config_active = [
        kv for kv in computes_config_active.get('computeStatistics', {}).iteritems()
        if kv[1].get('computeStatistic.enabled', False)
    ]
    print('= = ' * 20, type(compute_computeStatistics_config_active),
          'compute_computeStatistics_config_active = ')
    pprint(compute_computeStatistics_config_active)

    # {computeStatistic.id -> list[step_conf_tuple]},
    # where step_conf_tuple = (step_id, step_conf_dict)
    compute_prepares_config_active = dict(
        map(lambda computeStatistic_conf:
            (computeStatistic_conf[0],
             sorted(list_dict_merge(
                 map(lambda step_conf: map_conf_properties(step_conf[1], 'step.id'),
                     filter(lambda step_conf: step_conf[1].get('step.enabled', False),
                            computeStatistic_conf[1].get('prepares.steps', {}).iteritems()))
             ).iteritems())),
            compute_computeStatistics_config_active))
    # print('= = ' * 30, compute_prepares_config_active2 == compute_prepares_config_active)
    print('= = ' * 20, type(compute_prepares_config_active),
          'compute_prepares_config_active = ')
    pprint(compute_prepares_config_active)

    compute_computes_config_active = dict(
        map(lambda computeStatistic_conf:
            (computeStatistic_conf[0],
             sorted(list_dict_merge(
                 map(lambda step_conf: map_conf_properties(step_conf[1], 'step.id'),
                     filter(lambda step_conf: step_conf[1].get('step.enabled', False),
                            computeStatistic_conf[1].get('computes.steps', {}).iteritems()))
             ).iteritems())),
            compute_computeStatistics_config_active))
    print('= = ' * 20, type(compute_computes_config_active),
          'compute_computes_config_active = ')
    pprint(compute_computes_config_active)

    test_flag = False
    if not test_flag:
        # initialization
        # serializer experiments: the default is PickleSerializer()
        # UnpicklingError: invalid load key, '{'.
        # serializer=MarshalSerializer()  # ValueError: bad marshal data
        # serializer=AutoSerializer()  # ValueError: invalid sevialization type: {
        # serializer=CompressedSerializer(PickleSerializer())  # error: Error -3 while decompressing data: incorrect header check
        # sc = SparkContext(master, app_name, sparkHome=spark_home, pyFiles=pyFiles)
        # sc = SparkContext(master, app_name, sparkHome=sparkHome, pyFiles=pyFiles, serializer=MarshalSerializer())
        # sc = SparkContext(master, app_name, sparkHome=sparkHome, pyFiles=pyFiles, serializer=AutoSerializer())
        # sc = SparkContext(master, app_name, sparkHome=sparkHome, pyFiles=pyFiles, serializer=CompressedSerializer(PickleSerializer()))

        spark_conf = SparkConf()
        spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)

        # Spark Streaming tuning settings
        spark_streaming_blockInterval = str(
            app_conf.get('spark.streaming.blockInterval', '')).strip()
        if spark_streaming_blockInterval:
            spark_conf.set('spark.streaming.blockInterval',
                           spark_streaming_blockInterval)

        spark_streaming_kafka_maxRatePerPartition = str(
            app_conf.get('spark.streaming.kafka.maxRatePerPartition', '')).strip()
        if spark_streaming_kafka_maxRatePerPartition:
            spark_conf.set('spark.streaming.kafka.maxRatePerPartition',
                           spark_streaming_kafka_maxRatePerPartition)

        spark_streaming_receiver_maxRate = str(
            app_conf.get('spark.streaming.receiver.maxRate', '')).strip()
        if spark_streaming_receiver_maxRate:
            spark_conf.set('spark.streaming.receiver.maxRate',
                           spark_streaming_receiver_maxRate)

        spark_streaming_concurrentJobs = str(
            app_conf.get('spark.streaming.concurrentJobs', '')).strip()
        if spark_streaming_concurrentJobs:
            spark_conf.set('spark.streaming.concurrentJobs',
                           spark_streaming_concurrentJobs)

        # Spark SQL tuning settings
        spark_sql_shuffle_partitions = str(
            app_conf.get('spark.sql.shuffle.partitions', '')).strip()
        if spark_sql_shuffle_partitions:
            spark_conf.set('spark.sql.shuffle.partitions',
                           spark_sql_shuffle_partitions)

        sc = SparkContext(conf=spark_conf)
        for path in (pyFiles or []):
            sc.addPyFile(path)

        # external cache optimization: distribute via broadcast
        cache_manager = CacheManager()
        cache_broadcast_list = \
            [(cache_id, cache_manager.cache_dataset(sc, cache_conf))
             for cache_id, cache_conf in cache_confs_with_ds_conf.iteritems()
             if cache_conf.get('broadcast.enabled', False)]

        for cache_id, cache_broadcast in cache_broadcast_list:
            cache_confs_with_ds_conf[cache_id]['broadcast'] = cache_broadcast

        batchDurationSeconds = app_conf['batchDuration.seconds']
        ssc = StreamingContext(sc, batchDurationSeconds)
        sqlc = SQLContext(sc)

        # read the data source
        stream = StreamingReader.readSource(ssc, di_in_conf_with_ds_conf, app_conf)

        # stream processing: 1) instantiate the handler class configured for
        # this data interface, 2) call its stream-processing method
        # kafka_wordcount test:
        # counts = stream.flatMap(lambda line: line.split(" ")) \
        #     .map(lambda word: (word, 1)) \
        #     .reduceByKey(lambda a, b: a + b)
        # counts.pprint()
        StreamingApp.process(
            stream, sc, sqlc,
            di_in_conf_with_ds_conf, di_out_confs_with_ds_conf,
            cache_confs_with_ds_conf, prepares_config_active_steps,
            compute_prepares_config_active, compute_computes_config_active)

        ssc.start()
        ssc.awaitTermination()
checkpoint_dir = './Checkpoint/spark'
ssc.checkpoint(checkpoint_dir)

kafka_params = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "myUserGroup",
    "enable.auto.commit": "false",
    "auto.offset.reset": "largest"
}

# one direct stream per topic
dstream = [KafkaUtils.createDirectStream(ssc, [tlist[i]], kafka_params,
                                         keyDecoder=spot_decoder,
                                         valueDecoder=spot_decoder,
                                         messageHandler=setHandler)
           for i in range(len(tlist))]

countList = []
for index in range(len(tlist)):
    print(tlist[index])
    tempt = dstream[index].map(lambda x: getID(x)) \
                          .map(lambda x: (1, x)) \
                          .updateStateByKey(updatefunction)
    countList.append(tempt)
    countList[index].foreachRDD(lambda x: displayID(x))

ssc.start()
ssc.awaitTermination(5000)
ssc.stop()
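# A minimal sketch of the undefined updatefunction used with updateStateByKey
# above, assuming the state being tracked is a running count per key; the
# actual state shape in the original may differ.
def updatefunction(new_values, last_state):
    # new_values holds this batch's values for the key; last_state is the
    # previous state (None on first sight of the key)
    return (last_state or 0) + len(new_values)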
                             index_col=0).transpose()
        oldlog['cnt'] = oldlog['cnt'].astype(int)
        if user_id in oldlog.columns:
            oldlog[user_id] += self.chkedwords_df[user_id]
            oldlog[user_id] = oldlog[user_id].fillna(0).astype(int)
            oldlog.transpose().to_csv('./newslog.csv', encoding='euc_kr', mode='w')
        else:
            newlog = pd.concat([oldlog, self.chkedwords_df], axis=1,
                               join_axes=[oldlog.index], join='inner')
            newlog[user_id] = newlog[user_id].fillna(0).astype(int)
            newlog.transpose().to_csv('./newslog.csv', encoding='euc-kr')

if __name__ == '__main__':
    sc = SparkContext()
    ssc = StreamingContext(sc, 10)
    date = time.strftime("%y%m%d")
    tstream = ssc.textFileStream('hdfs://192.168.56.102:9000/cplogs/news/' + date)
    logprocess = NewsLogToCSV()
    tstream.foreachRDD(logprocess.process_newslog)
    ssc.start()
    ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a SparkContext (pointing at the cluster master) and an ssc
# with a 4-second batch interval
sc = SparkContext("spark://master:7077", "NetworkWordCount")
ssc = StreamingContext(sc, 4)

# Create a DStream reading from a socket
lines = ssc.socketTextStream("master", 9999)

# Word count
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD
wordCounts.pprint()

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for termination
wordCounts.pprint(5)

# Count lines
totalLines = 0
linesCount = 0

def computeMetrics(rdd):
    global totalLines
    global linesCount
    linesCount = rdd.count()
    totalLines += linesCount
    print(rdd.collect())
    print("Lines in RDD :", linesCount, " Total Lines:", totalLines)

lines.foreachRDD(computeMetrics)

# Compute window metrics
def windowMetrics(rdd):
    print("Window RDD size:", rdd.count())

windowedRDD = lines.window(6, 3)
windowedRDD.foreachRDD(windowMetrics)

streamContext.start()
# streamContext.stop()
streamContext.awaitTermination()
print("Overall lines :", totalLines)
class SparkConsumer:
    """Spark consumer reading from a Kafka topic which contains the ECG
    timeseries data."""

    def __init__(self, kafka_config_infile, ecg_spark_config_infile,
                 postgres_config_infile, s3bucket_config_infile, batch_interval):
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            filename='./tmp/spark_consumer.log',
                            filemode='w')
        self.logger = logging.getLogger('py4j')
        self.logger.setLevel(logging.WARN)
        self.ecg_spark_config = helpers.parse_config(ecg_spark_config_infile)
        self.postgres_config = helpers.parse_config(postgres_config_infile)
        self.s3bucket_config = helpers.parse_config(s3bucket_config_infile)
        self.kafka_config = helpers.parse_config(kafka_config_infile)
        self.sc = SparkContext(appName='ECGDashboardApp')
        self.sc.setLogLevel("FATAL")
        self.ssc = StreamingContext(self.sc, batch_interval)
        self.logger.warn('Opened spark Context')
        self.kafkastream = self.connectToKafkaBrokers()
        self.logger.warn('Opened connection to Kafka brokers')
        self.a = self.sc.accumulator(0)

    def start(self):
        """Starts the streaming context to start subscribing to the kafka topic."""
        self.ssc.start()
        self.logger.warn('Spark context started')
        self.ssc.awaitTermination()
        self.logger.warn('Spark context terminated')

    def connectToKafkaBrokers(self):
        """Set up the subscription to the kafka topic."""
        kafkastream = KafkaUtils.createDirectStream(
            self.ssc, [self.kafka_config["topic"]],
            {"metadata.broker.list": self.kafka_config['ip-addr'],
             "group.id": self.ecg_spark_config['group-id'],
             "num.partitions": str(self.kafka_config['partitions'])})
        self.logger.warn('Connected kafka stream to spark context')
        return kafkastream

    def runECG(self):
        """Grouping and insertion of ECG samples into the database."""
        lines = self.kafkastream.map(lambda x: x[1])
        self.logger.warn('Reading in kafka stream line')

        raw_record = lines.map(lambda line: line.encode('utf-8')). \
            map(lambda line: line.split(','))
        if raw_record is not None:
            raw_record.pprint()
        else:
            print('raw_record is none')
        record_interval = raw_record.map(lambda x: (x[0], x[1:])). \
            groupByKey().map(lambda x: (x[0], list(x[1])))
        record_interval.foreachRDD(
            lambda x: insertECGSamples(self.logger, self.postgres_config,
                                       accum(self.a), x))

        self.ssc.start()
        self.logger.warn('Spark context started')
        self.ssc.awaitTermination()
        self.logger.warn('Spark context terminated')

    def runHR(self):
        """Grouping and calculation of HR for insertion into the database."""
        s3 = boto3.client('s3')
        obj = s3.get_object(Bucket=self.s3bucket_config['bucket'],
                            Key="mgh001_metadata.txt")
        file_content = obj['Body'].read().decode('utf-8')
        meta_data = json.loads(file_content)
        fs = meta_data['fs']

        lines = self.kafkastream.map(lambda x: x[1])
        self.logger.warn('Reading in kafka stream line')

        raw_record = lines.map(lambda line: line.encode('utf-8')). \
            map(lambda line: line.split(','))
        if raw_record is not None:
            raw_record.pprint()
        else:
            print('raw_record is none')
        record_interval = raw_record.map(lambda line: (line[0], line[1:])). \
            groupByKey().map(lambda x: (x[0], list(x[1])))
        record_interval.foreachRDD(
            lambda x: processHRSample(self.logger, self.postgres_config,
                                      accum(self.a), fs, x))
        self.logger.warn('Saved records to DB')

        self.ssc.start()
        self.logger.warn('Spark context started')
        self.ssc.awaitTermination()
        self.logger.warn('Spark context terminated')
## Spark Streaming ##
import sys

## Develop Spark streaming context ##
# import os
# os.environ["SPARK_HOME"] = '/usr/lib/spark'
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

## Creating a spark stream for word count ##
if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName(
        "SparkStreamingcount").set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)
    strc = StreamingContext(sc, 1)
    strc.checkpoint(
        "hdfs://quickstart.cloudera:8020/user/cloudera/sparkstream")
    lines = strc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    count = lines.flatMap(lambda x: x.split(' ')).map(
        lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    count.pprint()
    strc.start()
    strc.awaitTermination()
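# Usage sketch (added): feed the socket with netcat on the chosen host/port,
# then submit the script with that host and port as arguments, e.g.:
#   nc -lk 9999
#   spark-submit streaming_wordcount.py localhost 9999
# (the script name here is illustrative)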
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext(master="local[2]", appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 2)  # 2-second batch interval
    ssc.checkpoint("file:///home/felipe/checkpoint2")  # fault tolerance
    # lines is a sequence of RDDs
    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))  # host and port
    counts = lines.countByWindow(10, 2)  # (window size, sliding interval)
    counts.pprint()  # prints once per interval; no explicit loop is needed
    ssc.start()  # start listening for streaming data
    ssc.awaitTermination()  # wait for the streaming computation to finish
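# Note (added): countByWindow(10, 2) counts the elements received over the
# last 10 seconds, recomputed every 2 seconds. In PySpark it is roughly a
# shorthand for an incremental reduceByWindow with an inverse function,
# which is why the checkpoint directory above is required:
#   counts = lines.map(lambda _: 1).reduceByWindow(
#       lambda a, b: a + b, lambda a, b: a - b, 10, 2)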
# (imports reconstructed; the top of this script is truncated in the source)
import argparse
import atexit
import json
import logging
import time

from kafka import KafkaProducer
from kafka.errors import KafkaError
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

logger = logging.getLogger('average-price')


def shutdown_hook(producer):
    try:
        producer.flush(10)
    except KafkaError as kafka_error:
        logger.warn('Failed to flush pending messages to kafka, caused by: %s',
                    kafka_error.message)
    finally:
        try:
            producer.close(10)
        except Exception as e:
            logger.warn('Failed to close kafka connection, caused by: %s',
                        e.message)


def process_stream(stream, kafka_producer, target_topic):

    def send_to_kafka(rdd):
        results = rdd.collect()
        for r in results:
            data = json.dumps({
                'Symbol': r[0],
                'Timestamp': time.time(),
                'Average': r[1]
            })
            try:
                logger.info('Sending average price %s to kafka', data)
                kafka_producer.send(target_topic, value=data)
            except KafkaError as error:
                logger.warn('Failed to send average price to kafka, '
                            'caused by: %s', error.message)

    def pair(data):
        record = json.loads(data.encode('utf-8'))
        # (symbol, (price, count))
        return record.get('Symbol'), (float(record.get('LastTradePrice')), 1)

    stream.map(pair) \
          .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
          .map(lambda (k, v): (k, v[0] / v[1])) \
          .foreachRDD(send_to_kafka)


if __name__ == '__main__':
    # Set up command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument('source_topic', help='the kafka topic to subscribe from.')
    parser.add_argument('target_topic', help='the kafka topic to send messages to.')
    parser.add_argument('kafka_broker', help='the kafka broker.')
    parser.add_argument('batch_duration', help='the batch duration in secs.')

    # Parse arguments.
    args = parser.parse_args()
    source_topic = args.source_topic
    target_topic = args.target_topic
    kafka_broker = args.kafka_broker
    batch_duration = int(args.batch_duration)

    # Create the SparkContext and StreamingContext.
    sc = SparkContext('local[2]', 'AveragePrice')
    sc.setLogLevel('INFO')
    ssc = StreamingContext(sc, batch_duration)

    # Instantiate a Kafka stream for processing.
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [source_topic], {'metadata.broker.list': kafka_broker})

    # Extract the value from each (key, value) message.
    stream = directKafkaStream.map(lambda x: x[1])

    # Instantiate a simple Kafka producer.
    kafka_producer = KafkaProducer(bootstrap_servers=kafka_broker)

    process_stream(stream, kafka_producer, target_topic)

    # Set up the shutdown hook.
    atexit.register(shutdown_hook, kafka_producer)

    ssc.start()
    ssc.awaitTermination()
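# Design note (added): send_to_kafka collects each RDD to the driver before
# producing. The KafkaProducer instance lives on the driver and cannot be
# pickled to executors, so for this small per-symbol aggregate collecting is
# the simple, safe choice; creating a producer per partition inside
# foreachPartition would be the alternative at larger scale.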
# (sc and batch_interval are defined in the truncated part of this script)
ssc = StreamingContext(sc, batch_interval)

# A streaming application must run 24 hours a day, so it needs to be
# resilient to failures caused by unexpected errors such as system failures,
# driver failures, JVM crashes, etc. Checkpointing saves the generated RDDs
# to reliable storage and performs recovery from an error.
# To summarise, checkpoints provide a way of recovering to a safe, stable
# application snapshot.
# The ssc.checkpoint() method tells the Spark engine where to store the
# checkpoint files.
ssc.checkpoint("checkpoint")

host = "localhost"
port = 9999
lines = ssc.socketTextStream(host, int(port))

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word in each batch, keeping a running total across batches
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.updateStateByKey(updateFunc)

# Print the result
wordCounts.pprint()

ssc.start()
try:
    ssc.awaitTermination(timeout=60)
except KeyboardInterrupt:
    pass
finally:
    ssc.stop()
    sc.stop()
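# Hedged sketch (added): updateFunc is referenced above but defined in the
# truncated part of this script; for a running word count it is typically
# the standard updateStateByKey callback:
def updateFunc(new_values, running_count):
    # new_values: counts for this key in the current batch;
    # running_count: state from previous batches (None on first occurrence).
    return sum(new_values) + (running_count or 0)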
# (imports reconstructed; the top of this script is truncated in the source --
#  json_to_row, process_messages and getSqlContextInstance are project-local)
from optparse import OptionParser
from pyspark import SparkContext
from pyspark.mllib.tree import RandomForestModel
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def main():
    parser = OptionParser()
    parser.add_option('--enriched_data_path', action='store',
                      dest='enriched_data_path',
                      help='path to write enriched data')
    parser.add_option('--model_path', action='store', dest='model_path',
                      help='path for model data')
    parser.add_option('--kafka_zookeeper_hosts', action='store',
                      dest='kafka_zookeeper_hosts',
                      help='list of Zookeeper hosts (host:port)')
    parser.add_option('--kafka_broker_list', action='store',
                      dest='kafka_broker_list',
                      help='list of Kafka brokers (host:port)')
    parser.add_option('--kafka_message_topic', action='store',
                      dest='kafka_message_topic',
                      help='topic to consume input messages from')
    parser.add_option('--kafka_alert_topic', action='store',
                      dest='kafka_alert_topic',
                      help='topic to produce alert messages to')
    parser.add_option('--kafka_enriched_data_topic', action='store',
                      dest='kafka_enriched_data_topic',
                      help='topic to produce enriched data to')
    parser.add_option('--streaming_batch_duration_sec', type='float',
                      default=15.0, action='store',
                      dest='streaming_batch_duration_sec',
                      help='Streaming batch duration in seconds')
    parser.add_option('--max_batches', type='int', default=0,
                      action='store', dest='max_batches',
                      help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load the saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create a Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(
        ssc, [options.kafka_message_topic], {
            'metadata.broker.list': options.kafka_broker_list,
            'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert the RDD of JSON strings to an RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: process_messages(
        time, rdd, ssc=ssc, model=model,
        enriched_data_path=options.enriched_data_path,
        zookeeper_hosts=options.kafka_zookeeper_hosts,
        kafka_alert_topic=options.kafka_alert_topic,
        kafka_enriched_data_topic=options.kafka_enriched_data_topic,
        max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
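# Hedged sketch (added): getSqlContextInstance is referenced above but
# defined elsewhere; the usual implementation is the lazily instantiated
# SQLContext singleton from the Spark Streaming programming guide, which
# reuses one SQLContext per process instead of creating one per batch:
from pyspark.sql import SQLContext

def getSqlContextInstance(sparkContext):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']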
# (imports reconstructed; the top of this script is truncated in the source)
import re
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext


def updateTotalCount(new_values, total_sum):
    # (function head truncated in the source; reconstructed with an assumed
    #  name -- the body is the standard running-count update)
    return sum(new_values) + (total_sum or 0)


def flatt(url):
    parts = re.split(r',', url)
    for u in parts:
        yield (str(u), 1)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("3 arguments required")
        sys.exit(-1)
    a = int(sys.argv[1])  # window size in seconds
    b = int(sys.argv[2])  # batch interval in seconds

    conf = SparkConf()
    conf.setAppName("BigData")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, b)
    ssc.checkpoint("/some")

    dataStream = ssc.socketTextStream("localhost", 9009)
    tweet2 = dataStream.filter(lambda w: w.split(';')[7] != "")
    tweet = tweet2.map(lambda x: x.split(';')[7])
    job = tweet.flatMap(lambda x: flatt(x))
    windowedWordCounts = job.reduceByKeyAndWindow(lambda x, y: x + y,
                                                  lambda x, y: x - y, a, 1)

    def gunf(time, rdd):
        val = sorted(rdd.collect(), key=lambda x: (-x[1], x[0]))
        if len(val) > 4:
            print(val[0][0] + "," + val[1][0] + "," + val[2][0] + "," +
                  val[3][0] + "," + val[4][0])

    windowedWordCounts.foreachRDD(gunf)
    ssc.start()
    ssc.awaitTermination(30)
    ssc.stop()
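# Design note (added): passing an inverse function (x - y) to
# reduceByKeyAndWindow lets Spark maintain each window incrementally,
# subtracting the batch that slid out of the window instead of re-reducing
# the whole window on every slide; this is also why ssc.checkpoint() is
# required above.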
# (imports reconstructed; the top of this script is truncated in the source)
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext


def rc(t):
    # (function head truncated in the source; the name rc is taken from the
    #  flatMap(rc) call below)
    if ',' not in t:
        return [t]
    else:
        y = t.split(",")
        return y


def fab(r):
    sr = r.sortBy(lambda x: (-x[1], x[0]))
    srr = sr.collect()
    c = 0
    i = 0
    if srr != []:
        # print the top five non-empty keys, comma-separated
        while c != 5:
            if srr[i][0] != '':
                if c != 4:
                    print(srr[i][0], end=',')
                else:
                    print(srr[i][0])
                c += 1
            i += 1


conf = SparkConf()
conf.setAppName("BigData")
ab = SparkContext(conf=conf)
cc = StreamingContext(ab, int(sys.argv[2]))
cc.checkpoint("~/checkpoint_BIGDATA")

stream = cc.socketTextStream("localhost", 9009)
finalans = stream.window(int(sys.argv[1]), 1) \
    .flatMap(rc) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda a, b: int(a) + int(b))
finalans.foreachRDD(fab)

cc.start()
cc.awaitTermination(25)
cc.stop()
# (imports reconstructed; the top of this script is truncated in the source --
#  cr is a tokenizer defined in the truncated part)
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext


def f(rdd):
    # (function head truncated in the source; the name f is taken from the
    #  foreachRDD(f) call below, and aw1 is assumed to be the sorted RDD
    #  produced in the truncated part)
    r = aw1.collect()
    if r != []:
        f1(r)


def f1(inp):
    # print the top five non-empty keys, comma-separated
    count = 0
    j = 0
    while count != 5:
        if inp[j][0] != "":
            if count != 4:
                print(inp[j][0], end=",")
            else:
                print(inp[j][0])
            count += 1
        j = j + 1


configuration = SparkConf()
configuration.setAppName("Assign3")
spark_context = SparkContext(conf=configuration)  # SparkContext takes conf=
stream_context = StreamingContext(spark_context, int(sys.argv[2]))
stream_context.checkpoint("~/checkpoint_Assign3")

stream = stream_context.socketTextStream("localhost", 8000)
o = stream.window(int(sys.argv[1]), 1).flatMap(cr).map(
    lambda z: (z, 1)).reduceByKey(lambda a, b: int(a) + int(b))
o.foreachRDD(f)

stream_context.start()
stream_context.awaitTermination(60)
stream_context.stop()
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.set("spark.master", "yarn")
conf.set("spark.app.name", "streamingapp")
sc = SparkContext(conf=conf)
streamc = StreamingContext(sc, batchDuration=15)

r1 = sc.textFile("s3://datasets-spark-learning/flat_files/au-500.csv")
ds1 = streamc.textFileStream(
    "s3://datasets-spark-learning/flat_files/csvfiles/")
ds2 = ds1.transform(lambda rdd: rdd.union(r1).map(lambda x: x + "spark"))
ds2.pprint()

streamc.start()
streamc.awaitTermination()
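# Note (added): textFileStream() only picks up files that appear in the
# monitored directory after the stream starts; pre-existing files are
# ignored. The transform() above unions each new batch with the static
# au-500 RDD, so every printed batch also contains the static rows, each
# with "spark" appended.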
# Get relevant data
# (rows, ssc and the helper functions are defined in the truncated part of
#  this script)
rows = rows.filter(lambda row: len(row) > 8)
airports_fromto = rows.map(lambda row: (
    (row[0], row[1], row[2], AMOrPM(row[5])),
    (row[3], row[4], departureTimePretty(row[5]), float(row[8]))
))

# Keep only the flights of interest
airports_fromto = airports_fromto.filter(lambda row: row[0] == ('BOS', 'ATL', '2008-04-03', 'AM')) \
    .union(airports_fromto.filter(lambda row: row[0] == ('ATL', 'LAX', '2008-04-05', 'PM'))) \
    .union(airports_fromto.filter(lambda row: row[0] == ('PHX', 'JFK', '2008-09-07', 'AM'))) \
    .union(airports_fromto.filter(lambda row: row[0] == ('JFK', 'MSP', '2008-09-09', 'PM'))) \
    .union(airports_fromto.filter(lambda row: row[0] == ('DFW', 'STL', '2008-01-24', 'AM'))) \
    .union(airports_fromto.filter(lambda row: row[0] == ('STL', 'ORD', '2008-01-26', 'PM'))) \
    .union(airports_fromto.filter(lambda row: row[0] == ('LAX', 'MIA', '2008-05-16', 'AM'))) \
    .union(airports_fromto.filter(lambda row: row[0] == ('MIA', 'LAX', '2008-05-18', 'PM')))

# Track the minimum price seen so far for each key
airports_fromto = airports_fromto.updateStateByKey(getMinimum)

# Print and save
airports_fromto.foreachRDD(printResults)
airports_fromto.foreachRDD(saveResults)

# Kafka sink
airports_fromto.foreachRDD(lambda rdd: rdd.foreachPartition(sendToKafka))

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
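# Hedged sketch (added): getMinimum is referenced above but defined in the
# truncated part of this script. Each state value is a tuple whose 4th field
# is the price, so a plausible updateStateByKey callback keeps the cheapest
# flight seen so far:
def getMinimum(new_values, current_min):
    candidates = list(new_values)
    if current_min is not None:
        candidates.append(current_min)
    if not candidates:
        return current_min  # no data for this key in this batch
    return min(candidates, key=lambda v: v[3])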
import sys
# (imports reconstructed; the top of this script is truncated in the source --
#  tmp and process_rdd are defined in the truncated part)
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext


def get_hashtag(a):
    # (function head truncated in the source; the name get_hashtag is taken
    #  from the flatMap(get_hashtag) call below)
    b = a.split(',')
    for i in b:
        if i != '':
            yield (i, 1)


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

batch = int(sys.argv[2])
window_size = int(sys.argv[1])
ssc = StreamingContext(sc, batch)
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
tweet = dataStream.map(tmp)
data = dataStream.window(window_size, 1) \
    .flatMap(get_hashtag) \
    .reduceByKey(lambda x, y: x + y) \
    .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))
data.foreachRDD(process_rdd)
# data.pprint(3)

ssc.start()
ssc.awaitTermination(12)
ssc.stop()
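# Hedged sketch (added): process_rdd is referenced above but defined in the
# truncated part of this script; since the DStream is already sorted by
# count, a plausible handler just prints the top hashtags (taking 5 is an
# assumption):
def process_rdd(rdd):
    for hashtag, count in rdd.take(5):
        print(hashtag, count)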
# (head of this try block -- apparently inside the rdd_processing handler
#  used below -- is truncated in the source)
        words_df.registerTempTable("Words")
        # Get the words from the table using SQL and print them
        words_df = sql_context.sql(
            "select word, word_count from Words order by word_count desc")
        words_df.show()
        # words_df.saveAsTextFiles("wc_output")
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)


# Initializing the spark context
sc = SparkContext("local[2]", "TCP Streaming word count")
# Streaming context with 5-second batches
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint_TwitterApp")

# Getting the data from the stream
lines = ssc.socketTextStream("localhost", 9009)
# Splitting the data using space as the delimiter
words = lines.flatMap(lambda line: line.split(" "))
# Mapping the words as key and value
pairs = words.map(lambda word: (word, 1))
# wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Passing the words to the aggregate function, which adds them to the
# previous count
words_total = pairs.updateStateByKey(aggregate_words_count)
words_total.foreachRDD(rdd_processing)
# wordCounts.pprint()

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait before termination
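# Hedged sketch (added): aggregate_words_count is referenced above but
# defined in the truncated part of this script; for a running word count it
# is typically the standard updateStateByKey callback:
def aggregate_words_count(new_values, total_sum):
    return sum(new_values) + (total_sum or 0)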