def read_offsets(cls, topics):
    try:
        zk = cls.get_zookeeper_instance()
        from_offsets = {}
        for topic in topics:
            logger.warning("TOPIC:%s", topic)
            # create the checkpoint path if it does not exist
            topic_path = ZK_CHECKPOINT_PATH + topic
            try:
                partitions = zk.get_children(topic_path)
                for partition in partitions:
                    topic_partition = TopicAndPartition(topic, int(partition))
                    partition_path = topic_path + '/' + partition
                    offset = int(zk.get(partition_path)[0])
                    from_offsets[topic_partition] = offset
            except Exception:
                try:
                    topic_partition = TopicAndPartition(topic, 0)
                    zk.ensure_path(topic_path + '/' + "0")
                    zk.set(topic_path, str(0).encode())
                    from_offsets[topic_partition] = 0
                    logger.warning("NO OFFSETS")
                except Exception:
                    logger.error('MAKE FIRST OFFSET', exc_info=True)
        # logger.warning("FROM_OFFSETS:%s", from_offsets)
        return from_offsets
    except Exception:
        logger.error('READ OFFSETS', exc_info=True)
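# The reader above implies a matching writer that commits each batch's untilOffset
# back under ZK_CHECKPOINT_PATH/<topic>/<partition>. A minimal sketch under that
# assumption (this helper is not part of the original; call it from foreachRDD):
def save_offsets(cls, rdd):
    zk = cls.get_zookeeper_instance()
    for offset_range in rdd.offsetRanges():
        partition_path = (ZK_CHECKPOINT_PATH + offset_range.topic +
                          '/' + str(offset_range.partition))
        zk.ensure_path(partition_path)
        zk.set(partition_path, str(offset_range.untilOffset).encode())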
def main():
    # create spark context and spark session
    sc = spark_context_creator()
    spark = SparkSession(sc)
    # to avoid unnecessary logs
    sc.setLogLevel("WARN")
    # create streaming context
    ssc = StreamingContext(sc, 3)
    # create stream handler object to process the stream
    stream_process = stream_handler_proessed.StreamHandler()
    # prepare direct stream parameters
    kafka_params = config.KAFKA_PARAMS
    # checkpoint file for the last consumed offset ranges
    offset_file_path = config.OFFSET_FILE_PATH
    # get last consumed offsets
    offset_ranges = stream_process.get_offset(offset_file_path)
    topics = []
    from_offset = {}
    # if the list is empty, this is the first consume, so configure offsets manually
    if not offset_ranges:
        topics = list(config.TOPICS_PARTIONS_OFFSETS.keys())
        topics_partions_offsets = config.TOPICS_PARTIONS_OFFSETS
        for topic in list(topics_partions_offsets.keys()):
            topic_partion = TopicAndPartition(topic, topics_partions_offsets[topic][0])
            from_offset[topic_partion] = topics_partions_offsets[topic][1]
    # otherwise, resume from the saved offset_ranges object
    else:
        for o in offset_ranges:
            topics.append(o.topic)
            topic_partion = TopicAndPartition(o.topic, o.partition)
            from_offset[topic_partion] = o.untilOffset
    # kafka consumer - spark connection
    kafka_direct_stream = KafkaUtils.createDirectStream(
        ssc,
        topics=topics,
        kafkaParams=kafka_params,
        fromOffsets=from_offset,
        keyDecoder=lambda screen_name: jsonpickle.encode(screen_name),
        valueDecoder=lambda tweet: jsonpickle.decode(tweet))
    # process the stream
    kafka_direct_stream.foreachRDD(
        lambda rdd: stream_process.process(rdd, spark, offset_file_path))
    # ssc.checkpoint(config.CHECKPOINT)
    ssc.start()
    ssc.awaitTermination()
def main():
    topicName = 'Test-OnlineMonitor'
    topic_partition = TopicAndPartition(topicName, 0)
    from_offsets = {topic_partition: 0}
    sc = SparkContext(appName="streamingkafka")
    sc.setLogLevel("ERROR")  # reduce log noise in the shell
    ssc = StreamingContext(sc, 1)  # 1-second batch interval
    brokers = '127.0.0.1:9092'
    topic = 'Test-OnlineMonitor'
    # consume Kafka in direct (receiver-less) mode
    message = KafkaUtils.createDirectStream(ssc, [topic],
                                            {"metadata.broker.list": brokers},
                                            fromOffsets=from_offsets,
                                            keyDecoder=spot_decoder,
                                            valueDecoder=spot_decoder)
    res = message.map(lambda x: x[1])
    # ID = res.map(lambda msg: getID(msg))
    ID = res.map(lambda msg: getValue(msg))
    # ID.pprint(25)
    # ID.foreachRDD(lambda x: print(x.first()))
    ID.foreachRDD(lambda x: displayRDD(x))
    ssc.start()
    ssc.awaitTermination(240)
    ssc.stop()
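# spot_decoder, getValue and displayRDD are assumed to exist elsewhere in this
# project. A minimal hedged sketch of what they might look like (names kept,
# behaviour is an assumption):
def spot_decoder(s):
    # pass records through, decoding bytes to text where possible
    if s is None:
        return None
    return s.decode('utf-8', errors='replace') if isinstance(s, bytes) else s

def getValue(msg):
    # assumption: the message value is used as-is
    return msg

def displayRDD(rdd):
    # print a small sample of each micro-batch on the driver
    for record in rdd.take(10):
        print(record)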
def setup():
    sc = SparkContext(conf=conf)
    # set the batch duration of the Streaming Context to 10 seconds
    ssc = StreamingContext(sc, 10)
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.checkpoint(checkpoints_folder)
    kafka_params = {
        "metadata.broker.list": "kafka:9092",
        "zookeeper.connect": "zookeeper:2181",
        "group.id": "spark-streaming",
        "zookeeper.connection.timeout.ms": "10000",
        "auto.offset.reset": "smallest"
    }
    start = 0
    partition = 0
    topic = 'twitter'
    topic_partition = TopicAndPartition(topic, partition)
    from_offset = {topic_partition: int(start)}
    # create a Kafka stream to consume data from the twitter topic
    # localhost:2181 = default Zookeeper consumer address
    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic], kafka_params,
                                                 fromOffsets=from_offset)
    create_transformations(kafka_stream)
    return ssc
def spark_kafka_consumer(kafka_topic: list, ssc, broker, consumer_group_id) -> KafkaDStream:
    """
    Supports only one topic at a time.
    :param kafka_topic: list containing a single topic name
    :return:
    """
    try:
        offsets = CC.get_kafka_offsets(kafka_topic[0])

        if bool(offsets):
            fromOffset = {}
            for offset in offsets:
                offset_start = offset["offset_start"]
                offset_until = offset["offset_until"]
                topic_partition = offset["topic_partition"]
                topic = offset["topic"]

                topicPartion = TopicAndPartition(topic, int(topic_partition))
                fromOffset[topicPartion] = int(offset_start)

            return KafkaUtils.createDirectStream(ssc, kafka_topic,
                                                 {"metadata.broker.list": broker,
                                                  "group.id": consumer_group_id},
                                                 fromOffsets=fromOffset)
        else:
            offset_reset = "smallest"  # smallest OR largest
            return KafkaUtils.createDirectStream(ssc, kafka_topic,
                                                 {"metadata.broker.list": broker,
                                                  "auto.offset.reset": offset_reset,
                                                  "group.id": consumer_group_id})
    except Exception as e:
        print(e)
def create_context():
    spark = get_session(SPARK_CONF)
    ssc = StreamingContext(spark.sparkContext, BATCH_DURATION)
    ssc.checkpoint(CHECKPOINT)

    # start offsets from the beginning
    # (this has no effect when restoring from a checkpoint)
    offsets = {TopicAndPartition(topic, 0): 0 for topic in TOPICS}

    stream = KafkaUtils.createDirectStream(ssc, TOPICS, KAFKA_PARAMS, offsets)
    main(stream)
    return ssc
def test_kafka_direct_stream_from_offset(self):
    """Test the Python direct Kafka stream API with start offset specified."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    fromOffsets = {TopicAndPartition(topic, 0): long(0)}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
    self._validateStreamResult(sendData, stream)
def main():
    record_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "offset.txt")
    with open(record_path, 'r') as f:
        start = json.loads(f.read())
    start_0, start_1, start_2 = start['start_0'], start['start_1'], start['start_2']

    kafkaStreams = KafkaUtils.createDirectStream(
        ssc, [topic],
        kafkaParams={"metadata.broker.list": brokers},
        fromOffsets={
            TopicAndPartition(topic, 0): int(start_0),
            TopicAndPartition(topic, 1): int(start_1),
            TopicAndPartition(topic, 2): int(start_2)
        })
    kafkaStreams.transform(storeOffsetRanges).map(format_data).foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
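# storeOffsetRanges / format_data / process are assumed to be defined elsewhere.
# A minimal sketch of storeOffsetRanges plus a writer that persists the three
# partition offsets back to offset.txt in the same JSON shape the reader above
# expects (keys start_0 / start_1 / start_2); the writer's name and logic are assumptions.
offset_ranges = []

def storeOffsetRanges(rdd):
    # capture the offset ranges while the RDD is still a KafkaRDD
    global offset_ranges
    offset_ranges = rdd.offsetRanges()
    return rdd

def write_offsets(record_path):
    # hypothetical helper: persist untilOffset per partition for the next run
    data = {"start_%d" % o.partition: o.untilOffset for o in offset_ranges}
    with open(record_path, 'w') as f:
        f.write(json.dumps(data))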
def test_kafka_rdd_with_leaders(self):
    """Test the Python direct Kafka RDD API with leaders."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
    address = self._kafkaTestUtils.brokerAddress().split(":")
    leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
    self._validateRddResult(sendData, rdd)
def _fetch_offsets(url):
    engine = create_engine(url)
    result = dict()
    with engine.begin() as conn:
        resultset = conn.execute(text(SELECT_OFFSETS_QUERY))
        result = {
            TopicAndPartition(topic=row['topic'], partition=int(row['partition'])):
                long(row['offset'])
            for row in resultset.fetchall()
        }
    return result
def functionToCreateContext():
    sc = SparkContext(appName=APP_NAME)
    ssc = StreamingContext(sc, PERIOD)

    offsets = {TopicAndPartition(topic, 0): long(0) for topic in TOPICS}
    kafkaParams = {"metadata.broker.list": BROKERS,
                   "group.id": GROUP_ID,
                   "auto.offset.reset": "smallest"}
    stream = KafkaUtils.createDirectStream(ssc, TOPICS, kafkaParams, offsets)

    main(stream)
    ssc.checkpoint(CHECKPOINT)
    return ssc
def read_offsets(topics):
    try:
        zk = PipelineUtils.getZookeeperInstance()
        from_offsets = {}
        for topic in topics:
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partion = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partion] = offset
        print("Previous offset -->", from_offsets)
        return from_offsets
    except Exception as e:
        print("An unexpected error occurred while reading offset", e)
def get_kafka_stream(topic, streaming_context):
    offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
    app_name = streaming_context.sparkContext.appName
    saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
    if len(saved_offset_spec) < 1:
        MonMetricsKafkaProcessor.log_debug(
            "No saved offsets available..."
            "connecting to kafka without specifying offsets")
        kvs = KafkaUtils.createDirectStream(
            streaming_context, [topic],
            {"metadata.broker.list": cfg.CONF.messaging.brokers})
        return kvs
    else:
        from_offsets = {}
        for key, value in saved_offset_spec.items():
            if key.startswith("%s_%s" % (app_name, topic)):
                # spec_app_name = value.get_app_name()
                spec_topic = value.get_topic()
                spec_partition = int(value.get_partition())
                # spec_from_offset = value.get_from_offset()
                spec_until_offset = value.get_until_offset()
                # composite_key = "%s_%s_%s" % (spec_app_name,
                #                               spec_topic,
                #                               spec_partition)
                # partition = saved_offset_spec[composite_key]
                from_offsets[
                    TopicAndPartition(spec_topic, spec_partition)
                ] = long(spec_until_offset)

        MonMetricsKafkaProcessor.log_debug(
            "get_kafka_stream: calling createDirectStream :"
            " topic:{%s} : start " % topic)
        for key, value in from_offsets.items():
            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream : "
                "offsets : TopicAndPartition:{%s,%s}, value:{%s}" %
                (str(key._topic), str(key._partition), str(value)))
        MonMetricsKafkaProcessor.log_debug(
            "get_kafka_stream: calling createDirectStream : "
            "topic:{%s} : done" % topic)

        kvs = KafkaUtils.createDirectStream(
            streaming_context, [topic],
            {"metadata.broker.list": cfg.CONF.messaging.brokers},
            from_offsets)
        return kvs
def get_kafka_stream(spark_streaming_context):
    topicPartion = TopicAndPartition(TOPIC, PARTITION)
    fromOffset = {topicPartion: long(START)}
    kafkaParams = {
        "metadata.broker.list": KAFKA_BROKER,
        'auto.offset.reset': 'smallest',
        "group.id": "group_id_1"
    }
    kafka_stream = KafkaUtils.createDirectStream(spark_streaming_context,
                                                 [TOPIC],
                                                 kafkaParams=kafkaParams,
                                                 fromOffsets=fromOffset)
    return kafka_stream
def read_offsets(zk, topics):
    from pyspark.streaming.kafka import TopicAndPartition
    from_offsets = {}
    try:
        for topic in topics:
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partion = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partion] = offset
    except Exception as e:
        print("Excep :: " + str(e))
    return from_offsets
def read_offsets(ssc, zk, topics, kafkaParams):
    from pyspark.streaming.kafka import TopicAndPartition
    print("zk===", zk)
    print("topics===", topics)
    from_offsets = {}
    try:
        for topic in topics:
            print("topic=====", topic)
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partion = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partion] = offset
    except Exception as e:
        print("read offset error=============", e)
    print("===from_offsets=====", from_offsets)
    return from_offsets
def initialize_stream(self):
    """ Initializes the stream from a Kafka topic. """
    topic, n = self.kafka_config["TOPIC"], self.kafka_config["PARTITIONS"]
    try:
        fromOffsets = {
            TopicAndPartition(topic, i): long(self.start_offset)
            for i in range(n)
        }
    except:
        fromOffsets = None

    self.dataStream = KafkaUtils.createDirectStream(
        self.ssc, [topic],
        {"metadata.broker.list": self.kafka_config["BROKERS_IP"]},
        fromOffsets=fromOffsets)
def readOffsets(zk, topics, groupID):
    from_offsets = {}
    for topic in topics:
        childName = '/consumers/' + topic
        zk.ensure_path(childName)
        for partition in zk.get_children(childName):
            childPart = childName + '/' + partition
            zk.ensure_path(childPart)
            topic_partition = TopicAndPartition(topic, int(partition))
            try:
                offset = int(zk.get(childPart)[0])
            except:
                print(" ============= Get child partition error ============== ")
                return None
            from_offsets[topic_partition] = offset
    return from_offsets
def save_by_spark_streaming():
    root_path = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(root_path, "offset.txt")
    print("offset.txt--save--record_path: %s" % record_path)
    from_offsets = {}
    # read the saved offset; if the record file is missing, fall back to the default (the largest offset)
    if os.path.exists(record_path):
        f = open(record_path, "r")
        offset_data = json.loads(f.read())
        f.close()
        if offset_data["topic"] != topic_name:
            raise Exception("the topic name in offset.txt is incorrect")
        topic_partion = TopicAndPartition(offset_data["topic"], offset_data["partition"])
        from_offsets = {topic_partion: int(offset_data["untilOffset"])}  # note how the starting offset is set
        print("start from offsets: %s" % from_offsets)

    sc = SparkContext(appName="Realtime-Analytics-Engine")
    ssc = StreamingContext(sc, int(timer))
    kvs = KafkaUtils.createDirectStream(ssc=ssc, topics=[topic_name], fromOffsets=from_offsets,
                                        kafkaParams={"metadata.broker.list": broker_list})

    # offset handling as shown in the official docs:
    # directKafkaStream \
    #     .transform(storeOffsetRanges) \
    #     .foreachRDD(printOffsetRanges)

    # transactional processing
    # kvs.foreachRDD(lambda rec: deal_data(rec))
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    # persist the offsets
    kvs.transform(store_offset_ranges).foreachRDD(save_offset_ranges)

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
def save_by_spark_streaming():
    root_path = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(root_path, "offset.txt")
    from_offsets = {}
    # read the saved offset; if the record file is missing, fall back to the default (the largest offset)
    if os.path.exists(record_path):
        f = open(record_path, "r")
        offset_data = json.loads(f.read())
        f.close()
        if offset_data["topic"] != topic_name:
            raise Exception("the topic name in offset.txt is incorrect")
        topic_partion = TopicAndPartition(offset_data["topic"], offset_data["partition"])
        print('topic_partion', type(topic_partion))
        # how the starting offset is set (topic_partion was converted to str here to use it as a dict key)
        from_offsets = {str(topic_partion): int(offset_data["untilOffset"])}
        print("start from offsets: %s" % from_offsets)
        print("type(from_offsets)", type(from_offsets))

    sc = SparkContext(appName="Realtime-Analytics-Engine")
    ssc = StreamingContext(sc, int(timer))
    '''
    When createDirectStream consumes from_offsets it raises
    AttributeError: 'str' object has no attribute '_jTopicAndPartition',
    which conflicts with converting topic_partion to str above. The reason is this
    line inside createDirectStream:
        jfromOffsets = dict([(k._jTopicAndPartition(helper), v) for (k, v) in fromOffsets.items()])
    '''
    # kvs = KafkaUtils.createDirectStream(ssc=ssc, topics=[topic_name], fromOffsets=from_offsets,
    #                                     kafkaParams={"metadata.broker.list": broker_list})
    kvs = KafkaUtils.createDirectStream(
        ssc=ssc, topics=[topic_name],
        kafkaParams={"metadata.broker.list": broker_list})
    kvs.foreachRDD(lambda rec: deal_data(rec))
    kvs.transform(store_offset_ranges).foreachRDD(save_offset_ranges)
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
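# store_offset_ranges / save_offset_ranges are referenced above but not shown.
# A minimal sketch under the assumption that they persist the last OffsetRange to
# offset.txt in the same JSON shape the reader expects (keys: topic / partition /
# untilOffset); the exact behaviour is an assumption, not the original code.
offset_ranges = []

def store_offset_ranges(rdd):
    # grab the offset ranges while the RDD is still a KafkaRDD
    global offset_ranges
    offset_ranges = rdd.offsetRanges()
    return rdd

def save_offset_ranges(rdd):
    record_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "offset.txt")
    for o in offset_ranges:
        data = {"topic": o.topic, "partition": o.partition, "untilOffset": o.untilOffset}
        with open(record_path, "w") as f:
            f.write(json.dumps(data))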
def read_offsets(zk, topics, consumer_group):
    """
    Read, from the registered ZooKeeper, the offsets of the given consumer_group
    for the given topics.
    """
    from_offsets = {}
    for topic in topics:
        child_nodes = zk.get_children(f"/consumers/{consumer_group}/owners/{topic}")
        for partition in child_nodes:
            topic_partition = TopicAndPartition(topic, int(partition))
            partition_offset = zk.get(f"/consumers/{consumer_group}/owners/{topic}/{partition}")
            print(f"{partition} offset :", partition_offset)
            if not partition_offset:
                print('The Spark streaming job started for the first time, so the offset should be ZERO.')
                offset = 0
            else:
                offset = int(partition_offset[0])
            from_offsets[topic_partition] = offset
    print("from_offset:", from_offsets)
    return from_offsets
def streaming():
    global wtopic
    for i in range(0, len(Topic)):
        print(Topic[i])
        fromOffsets = {TopicAndPartition(Topic[i], 0): long(0)}
        kafkaParams = {"metadata.broker.list": 'localhost:9092'}
        ks = KafkaUtils.createDirectStream(ssc, [Topic[i]],
                                           kafkaParams,
                                           fromOffsets)
        ks.foreachRDD(handler)
        if Topic[i] == 'Task10-DlyPrices':
            print("prices")
            lines = ks.map(lambda v: v[1])
            tlines = lines.map(lambda prices: (prices.split(",")))
            rlines = tlines.map(lambda t: (t[0], str(t[1]), t[2], t[3], t[4],
                                           t[5], t[6], t[7]))
            rlines.foreachRDD(CreateDffortuple)
        else:
            wtopic = Topic[i]
            print("other ", wtopic)
            lines = ks.map(lambda v: v[1])
            lines.foreachRDD(CreateDfforjson)
def getStartOffsets(task, topic, partitions):
    connection = MySQLdb.connect(user='******', db='test', host="127.0.0.1", passwd="")
    cursor = connection.cursor()
    que = 'SELECT `partition`, `offset` FROM `test`.`kafka_offsets` WHERE `task`="%s" AND `topic`="%s"' % (
        task, topic)
    print(que)
    cnt = cursor.execute(que)
    if not cnt:
        for p in range(partitions):
            que = 'INSERT INTO test.kafka_offsets (`task`,`topic`,`partition`,`offset`) VALUES ("%s","%s",%s,0)' % (
                task, topic, p)
            print(que)
            cnt = cursor.execute(que)
        connection.commit()
        return getStartOffsets(task, topic, partitions)
    ret = {}
    for row in cursor.fetchall():
        ret[TopicAndPartition(topic, row[0])] = long(row[1])
    connection.close()
    return ret
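# A companion writer is implied but not shown. A minimal sketch, assuming the same
# test.kafka_offsets table, that records each batch's untilOffset per partition
# after processing; the function name and update strategy are assumptions.
def saveOffsets(task, topic, offset_ranges):
    connection = MySQLdb.connect(user='******', db='test', host="127.0.0.1", passwd="")
    cursor = connection.cursor()
    for o in offset_ranges:
        # parameterized update, one row per partition
        cursor.execute(
            'UPDATE test.kafka_offsets SET `offset`=%s WHERE `task`=%s AND `topic`=%s AND `partition`=%s',
            (o.untilOffset, task, topic, o.partition))
    connection.commit()
    connection.close()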
    globals()['KazooSingletonInstance'].start()
    return globals()['KazooSingletonInstance']


def save_offsets(rdd):
    zk = get_zookeeper_instance()
    for offset in rdd.offsetRanges():
        path = f"/consumers"
        print(path)
        zk.ensure_path(path)
        zk.set(path, str(offset.untilOffset).encode())


TOPIC = 'anna'
PARTITION = 0
topicAndPartition = TopicAndPartition(TOPIC, PARTITION)
fromOffsets = {topicAndPartition: int(PARTITION)}


def main(brokers="127.0.0.1:9092", topics=['anna']):
    sc = SparkContext(appName="PythonStreamingSaveOffsets")
    ssc = StreamingContext(sc, 2)
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, topics, {"metadata.broker.list": brokers},
        fromOffsets=fromOffsets)
    directKafkaStream.foreachRDD(save_offsets)
    ssc.start()
    ssc.awaitTermination()
import random

from pyspark.sql import SQLContext, Row
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

sc = pyspark.SparkContext()
ssc = StreamingContext(sc, 20)
sqlContext = SQLContext(sc)

topic = "notificacion_eventos_internos"
brokers = "127.0.0.1:9092"
partition = 0
start = 0
topicpartion = TopicAndPartition(topic, partition)
fromoffset = {topicpartion: int(start)}

kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                    {"metadata.broker.list": brokers},
                                    fromOffsets=fromoffset)
data = kvs.map(lambda line: line)
# data.write.parquet("hdfs://data.parquet")

schema = StructType(
    [StructField(str(i), StringType(), True) for i in range(2)])


def saveData(rdd):
    now = datetime.now()
    current_time = now.strftime("%Y%m%d_%H%M%S")
    # rdd.saveAsTextFile("resultados/raw-${System.currentTimeInMillis()}.txt")
def extractInfo(flight, pm=False):
    flightDate = datetime.date(int(flight[0]), int(flight[1]), int(flight[2]))
    yDest = flight[7]
    if pm:
        yDest = flight[6]
        flightDate -= datetime.timedelta(days=2)
    return ((str(flightDate), yDest),
            (flight[6], flight[7], flight[4], flight[5], flight[8],
             float(flight[10].strip('\"'))))


sc = SparkContext(appName="bestFlights")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
ssc.checkpoint("s3://mudabircapstonecheckpoint/bestFlights/")
topicPartition = TopicAndPartition("airportsAll2", 0)
fromOffset = {topicPartition: 0}
kafkaParams = {
    "metadata.broker.list": "b-2.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-3.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-1.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092"
}
stream = KafkaUtils.createDirectStream(ssc, ['airportsAll2'], kafkaParams,
                                       fromOffsets=fromOffset)
'''
The incoming data format is
Year|Month|date|DayofWeek|UniqueCarrier|FlightNum|Origin|Dest|CRSDeptime|DepDelay|ArrDelay
'''
rdd = stream.map(lambda x: x[1])
from pyspark.streaming.kafka import TopicAndPartition
from pyspark.streaming.kafka import KafkaUtils


def toredis(rdd):
    import redis
    rclient = redis.Redis(host="172.17.0.7", port=6379)
    for y in rdd:
        print(y)
        rclient.set(y[0], y[1])
        rclient.set(y, 5)


sparkcontext = SparkContext("spark://0.0.0.0:7077", "spark_test01")
offsets = {}
part01 = TopicAndPartition('', 2)
sparkcontext.addPyFile('redis.zip')
ssc = StreamingContext(sparkcontext, 5)
kafka_strem_context = KafkaUtils.createDirectStream(
    ssc, ['flumetest2'], {
        "metadata.broker.list": '172.17.0.6:9092',
        'auto.offset.reset': 'largest'
    })
kafka_strem_context.map(lambda x: (x.split("|@|")[0], x.split("|@|")[2])
                        ).reduceByKey(lambda a, b: int(a) + int(b)).foreachRDD(
                            lambda q: q.foreachPartition(toredis))
# kafka_strem_context.map(lambda x: (x.split("|@|")[0], x.split("|@|")[2])).reduceByKey(lambda a, b: int(a)+int(b)).foreachPartition(toredis)
# AttributeError: 'TransformedDStream' object has no attribute 'foreachPartition'
ssc.start()
ssc.awaitTermination()
def main(): sc = SparkContext(appName="Twitchatter") sc.setLogLevel('ERROR') # broadcast the emotes set global_emotes = sc.broadcast(load_emotes()) #print(global_emotes.value.keys()) sub_emotes = sc.broadcast(load_subemotes()) #print(sub_emotes.value.keys()[:10]) batch_duration = 6 ssc = StreamingContext(sc, batch_duration) # every 3 seconds per batch # set checkpoint directory:use default fs protocol in core-site.xml ssc.checkpoint("hdfs://" + config.spark_ckpt) zkQuorum = [config.zk_address] topic = [config.topic] print("{}{}".format(zkQuorum, topic)) partition = 0 start = 0 topicpartition = TopicAndPartition(topic[0], partition) kvs = KafkaUtils.createDirectStream( ssc, topic, {"metadata.broker.list": config.ip_address}) # uncomment the following if running sum #kvs = KafkaUtils.createDirectStream(ssc,topic,{"metadata.broker.list": config.ip_address}, # fromOffsets={topicpartition: int(start)}) #kvs.checkpoint(600) parsed = kvs.map(lambda v: json.loads(v[1])) window_duration, sliding_duration = 12, 12 # (1) total count of emotes for given channel def get_emotes_count(x): line = x.split(" ") words = [item.encode('utf-8') for item in line] emotes = [item for item in words if item in global_emotes.value] #emotes = [item for item in words if item in global_emotes.value.keys()] return dict(Counter(emotes)) def sum_dict(x, y): return {k: x.get(k, 0) + y.get(k, 0) for k in set(x) | set(y)} def sub_dict(x, y): return {k: x.get(k, 0) - y.get(k, 0) for k in set(x) | set(y)} def get_count(x): line = x.split(" ") words = [item.encode('utf-8') for item in line] #emotes = [item for item in words if item in global_emotes.value.keys()] #subemotes = [item for item in words if item in sub_emotes.value.keys()] emotes = [item for item in words if item in global_emotes.value] subemotes = [item for item in words if item in sub_emotes.value] return [len(emotes), len(subemotes)] def sum_list(x, y): return [x[0] + y[0], x[1] + y[1]] def sub_list(x, y): return [x[0] - y[0], x[1] - y[1]] channel_count_time = parsed.map(lambda v: (v[u'channel'],v[u'message']))\ .mapValues(get_count)\ .reduceByKeyAndWindow(sum_list,sub_list,window_duration,sliding_duration)\ .map(lambda v: {"channel":v[0],\ "global_emotes":v[1][0],\ "subscriber_emotes":v[1][1],\ "total_emotes":(v[1][0]+v[1][1]),\ "timestamp":datetime.datetime.now()\ .strftime("%Y-%m-%d %H:%M:%S")}) #channel_count_time.pprint() # 2) get individual emotes count for given channel channel_message = parsed.map(lambda v: [(v[u'channel'],word) \ for word in v[u'message'].split(" ")])\ .flatMap(lambda x: x) #channel_message.pprint() def get_global(x): if x[1] in global_emotes.value: #if x[1] in global_emotes.value.keys(): return True else: return False def get_sub(x): if x[1] in sub_emotes.value: #if x[1] in sub_emotes.value.keys(): return True else: return False channel_emotes = channel_message.filter(get_global)\ .map(lambda v: (v[0],v[1],True)) #channel_emotes.pprint() channel_subemotes = channel_message.filter(get_sub)\ .map(lambda v: (v[0],v[1],False)) #channel_subemotes.pprint() time_channel_emotes_count = channel_emotes.union(channel_subemotes)\ .map(lambda v: ((v[0],v[1],v[2]),1))\ .reduceByKeyAndWindow(lambda x,y: x+y,lambda x,y:x-y,\ window_duration, sliding_duration)\ .map(lambda v: { "timestamp":datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\ "channel":v[0][0],\ "emote_name":v[0][1],\ "is_free":v[0][2],\ "count":v[1]}) #time_channel_emotes_count.pprint() # 3) get world cup Footy emotes count for all channels footy_count = parsed.flatMap(lambda v: 
v[u'message'].split(" "))\ .filter(lambda x: 'Footy' in x)\ .map(lambda x: (x,1))\ .reduceByKeyAndWindow(lambda x,y: x+y,lambda x,y:x-y,\ window_duration, sliding_duration)\ .map(lambda v: { "timestamp":datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), \ "emote_name":v[0],\ "count":v[1]}) channel_count_time.saveToCassandra(config.cass_keyspace, "channel_count_time") time_channel_emotes_count.saveToCassandra(config.cass_keyspace, "time_channel_emotes_count") footy_count.saveToCassandra(config.cass_keyspace, "time_footy_count") ssc.start() ssc.awaitTermination()
def main(): sc = SparkContext(appName="Twitchatter") sc.setLogLevel('ERROR') # broadcast the emotes set global_emotes = sc.broadcast(load_emotes()) #print(global_emotes.value.keys()) sub_emotes = sc.broadcast(load_subemotes()) #print(sub_emotes.value.keys()[:10]) batch_duration = 5 ssc = StreamingContext(sc, 5) # every 3 seconds per batch # set checkpoint directory:use default fs protocol in core-site.xml ssc.checkpoint("hdfs://" + config.spark_ckpt) zkQuorum = [config.zk_address] topic = [config.topic] print("{}{}".format(zkQuorum, topic)) partition = 0 start = 0 topicpartition = TopicAndPartition(topic[0], partition) kvs = KafkaUtils.createDirectStream( ssc, topic, {"metadata.broker.list": config.ip_address}) # uncomment the following if running sum #kvs = KafkaUtils.createDirectStream(ssc,topic,{"metadata.broker.list": config.ip_address}, # fromOffsets={topicpartition: int(start)}) #kvs.checkpoint(600) parsed = kvs.map(lambda v: json.loads(v[1])) window_duration, sliding_duration = 60, 20 # (1) total count of emotes for given channel def get_emotes_count(x): line = x.split(" ") words = [item.encode('utf-8') for item in line] emotes = [item for item in words if item in global_emotes.value.keys()] return dict(Counter(emotes)) def sum_dict(x, y): return {k: x.get(k, 0) + y.get(k, 0) for k in set(x) | set(y)} def sub_dict(x, y): return {k: x.get(k, 0) - y.get(k, 0) for k in set(x) | set(y)} def get_count(x): line = x.split(" ") words = [item.encode('utf-8') for item in line] emotes = [item for item in words if item in global_emotes.value.keys()] subemotes = [item for item in words if item in sub_emotes.value.keys()] return [len(emotes), len(subemotes)] def sum_list(x, y): return [x[0] + y[0], x[1] + y[1]] def sub_list(x, y): return [x[0] - y[0], x[1] - y[1]] channel_count_time = parsed.map(lambda v: (v[u'channel'],v[u'message']))\ .mapValues(get_count)\ .reduceByKeyAndWindow(sum_list,sub_list,window_duration,sliding_duration)\ .map(lambda v: {"channel":v[0],\ "global_emotes":v[1][0],\ "subscriber_emotes":v[1][1],\ "total_emotes":(v[1][0]+v[1][1]),\ "timestamp":datetime.datetime.now()\ .strftime("%Y-%m-%d %H:%M:%S")}) #channel_count_time.pprint() # 2) get individual emotes count for given channel channel_message = parsed.map(lambda v: [(v[u'channel'],word) \ for word in v[u'message'].split(" ")])\ .flatMap(lambda x: x) #channel_message.pprint() def get_global(x): if x[1] in global_emotes.value.keys(): return True else: return False def get_sub(x): if x[1] in sub_emotes.value.keys(): return True else: return False channel_emotes = channel_message.filter(get_global)\ .map(lambda v: (v[0],v[1],True)) #channel_emotes.pprint() channel_subemotes = channel_message.filter(get_sub)\ .map(lambda v: (v[0],v[1],False)) #channel_subemotes.pprint() time_channel_emotes_count = channel_emotes.union(channel_subemotes)\ .map(lambda v: ((v[0],v[1],v[2]),1))\ .reduceByKeyAndWindow(lambda x,y: x+y,lambda x,y:x-y,\ window_duration, sliding_duration)\ .map(lambda v: { "timestamp":datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\ "channel":v[0][0],\ "emote_name":v[0][1],\ "is_free":v[0][2],\ "count":v[1],\ }) #time_channel_emotes_count.pprint() # connect to cassandra cluster cluster = Cluster([config.cass_seedip]) session = cluster.connect() # create and set cassandra keyspace to work only once. 
session.execute( "CREATE KEYSPACE IF NOT EXISTS " + config.cass_keyspace + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '3'};" ) session.set_keyspace(config.cass_keyspace) # create tables to insert data session.execute( "CREATE TABLE IF NOT EXISTS channel_count_time (channel text, global_emotes int, subscriber_emotes int, total_emotes int, timestamp text, primary key(channel,timestamp));" ) channel_count_time.saveToCassandra(config.cass_keyspace, "channel_count_time") session.execute( "CREATE TABLE IF NOT EXISTS time_channel_emotes_count (timestamp text, channel text, emote_name text, is_free boolean, count int, primary key(emote_name,timestamp));" ) time_channel_emotes_count.saveToCassandra(config.cass_keyspace, "time_channel_emotes_count") ssc.start() ssc.awaitTermination()
print(airport)


def isFloat(row):
    try:
        float(row[10])
        return True
    except:
        return False


sc = SparkContext(appName="top10airports")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
ssc.checkpoint("s3://mudabircapstonecheckpoint/top10carriers/")
topicPartition = TopicAndPartition("airportsFull", 0)
fromOffset = {topicPartition: 0}
kafkaParams = {
    "metadata.broker.list": "b-2.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-3.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-1.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092"
}
stream = KafkaUtils.createDirectStream(ssc, ['airportsFull'], kafkaParams,
                                       fromOffsets=fromOffset)
'''
The incoming data format is
Year|Month|date|DayofWeek|UniqueCarrier|FlightNum|Origin|Dest|CRSDeptime|DepDelay|ArrDelay
'''
rdd = stream.map(lambda x: x[1])