Example #1
    def read_offsets(cls, topics):
        try:
            zk = cls.get_zookeeper_instance()
            from_offsets = {}
            for topic in topics:
                logger.warning("TOPIC:%s", topic)
                #create path if it does not exist
                topic_path = ZK_CHECKPOINT_PATH + topic

                try:
                    partitions = zk.get_children(topic_path)
                    for partition in partitions:
                        topic_partition = TopicAndPartition(
                            topic, int(partition))
                        partition_path = topic_path + '/' + partition
                        offset = int(zk.get(partition_path)[0])
                        from_offsets[topic_partition] = offset
                except Exception:
                    try:
                        topic_partition = TopicAndPartition(topic, int(0))
                        zk.ensure_path(topic_path + '/' + "0")
                        # write the initial offset to the partition node so that
                        # zk.get(partition_path) above can read it back on the next run
                        zk.set(topic_path + '/' + "0", str(0).encode())
                        from_offsets[topic_partition] = int(0)
                        logger.warning("NO OFFSETS")
                    except Exception:
                        logger.error('MAKE FIRST OFFSET', exc_info=True)

            #logger.warning("FROM_OFFSETS:%s",from_offsets)
            return from_offsets
        except Exception:
            logger.error('READ OFFSETS', exc_info=True)
def main():
    # create spark context, spark session
    sc = spark_context_creator()
    spark = SparkSession(sc)
    # To avoid unnecessary logs
    sc.setLogLevel("WARN")
    # create streaming context
    ssc = StreamingContext(sc, 3)

    # create stream handler object to process stream
    stream_process = stream_handler_proessed.StreamHandler()

    # prepare direct stream parameters
    kafka_params = config.KAFKA_PARAMS
    # checkpoint for the last consumed offset object
    offset_file_path = config.OFFSET_FILE_PATH
    # get last consumed offset
    offset_ranges = stream_process.get_offset(offset_file_path)
    topics = []
    from_offset = {}
    # if the list is empty this is the first consumption, so configure offsets manually
    if not offset_ranges:
        topics = list(config.TOPICS_PARTIONS_OFFSETS.keys())
        topics_partions_offsets = config.TOPICS_PARTIONS_OFFSETS
        for topic in list(topics_partions_offsets.keys()):
            topic_partion = TopicAndPartition(
                topic, topics_partions_offsets[topic][0])
            from_offset[topic_partion] = topics_partions_offsets[topic][1]
    # get info from saved offset_ranges object
    else:
        for o in offset_ranges:
            topics.append(o.topic)
            topic_partion = TopicAndPartition(o.topic, o.partition)
            from_offset[topic_partion] = o.untilOffset

    # kafka consumer-spark connection
    kafka_direct_stream = KafkaUtils.createDirectStream(
        ssc,
        topics=topics,
        kafkaParams=kafka_params,
        fromOffsets=from_offset,
        keyDecoder=lambda screen_name: jsonpickle.encode(screen_name),
        valueDecoder=lambda tweet: jsonpickle.decode(tweet))

    # process the stream
    kafka_direct_stream.foreachRDD(
        lambda rdd: stream_process.process(rdd, spark, offset_file_path))
    # ssc.checkpoint(config.CHECKPOINT)
    ssc.start()
    ssc.awaitTermination()
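
Example #1 only shows the read side of the ZooKeeper checkpoint. A minimal write-side sketch, assuming the same get_zookeeper_instance() helper and ZK_CHECKPOINT_PATH prefix used above (in the spirit of the Kazoo-based save_offsets in Example #24), might look like this:

# Sketch only (not part of the original example): persist each partition's
# untilOffset under ZK_CHECKPOINT_PATH + topic + '/' + partition, the same layout
# that read_offsets expects to find on the next run.
def save_offsets(rdd):
    zk = get_zookeeper_instance()  # assumed module-level twin of cls.get_zookeeper_instance()
    for offset_range in rdd.offsetRanges():
        path = ZK_CHECKPOINT_PATH + offset_range.topic + '/' + str(offset_range.partition)
        zk.ensure_path(path)
        zk.set(path, str(offset_range.untilOffset).encode())

# e.g. wired into the streaming job with:
# kafka_direct_stream.foreachRDD(save_offsets)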
Example #3
def main():
    topicName = 'Test-OnlineMonitor'
    topic_partition = TopicAndPartition(topicName, 0)
    from_offsets = {topic_partition: 0}

    sc = SparkContext(appName="streamingkafka")
    sc.setLogLevel("ERROR")  # 减少shell打印日志
    ssc = StreamingContext(sc, 1)  # 1秒的计算窗口
    brokers = '127.0.0.1:9092'
    topic = 'Test-OnlineMonitor'
    # consume Kafka with Spark Streaming in direct mode
    message = KafkaUtils.createDirectStream(ssc, [topic], \
                    {"metadata.broker.list": brokers},\
                    fromOffsets=from_offsets,\
                    keyDecoder=spot_decoder,\
                    valueDecoder=spot_decoder)
    res = message.map(lambda x: x[1])

    #ID = res.map( lambda msg: getID( msg ) )
    ID = res.map(lambda msg: getValue(msg))

    #ID.pprint(25)
    #ID.foreachRDD( lambda x: print(x.first()) )
    ID.foreachRDD(lambda x: displayRDD(x))

    ssc.start()
    ssc.awaitTermination(240)
    ssc.stop()
Example #4
def setup():
    sc = SparkContext(conf=conf)

    # Set the Batch duration to 10 sec of Streaming Context
    ssc = StreamingContext(sc, 10)
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.checkpoint(checkpoints_folder)

    kafka_params = {
        "metadata.broker.list": "kafka:9092",
        "zookeeper.connect": "zookeeper:2181",
        "group.id": "spark-streaming",
        "zookeeper.connection.timeout.ms": "10000",
        "auto.offset.reset": "smallest"
    }
    start = 0
    partition = 0
    topic = 'twitter'
    topic_partition = TopicAndPartition(topic, partition)
    from_offset = {topic_partition: int(start)}

    # Create Kafka Stream to Consume Data Comes From Twitter Topic
    # localhost:2181 = Default Zookeeper Consumer Address
    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic],
                                                 kafka_params,
                                                 fromOffsets=from_offset)

    create_transformations(kafka_stream)

    return ssc
Example #5
def spark_kafka_consumer(kafka_topic: str, ssc, broker, consumer_group_id) -> KafkaDStream:
    """
    supports only one topic at a time
    :param kafka_topic:
    :return:
    """
    try:
        offsets = CC.get_kafka_offsets(kafka_topic[0])

        if bool(offsets):
            fromOffset = {}
            for offset in offsets:
                offset_start = offset["offset_start"]
                offset_until = offset["offset_until"]
                topic_partition = offset["topic_partition"]
                topic = offset["topic"]

                topicPartion = TopicAndPartition(topic,int(topic_partition))
                fromOffset[topicPartion] = int(offset_start)

            return KafkaUtils.createDirectStream(ssc, kafka_topic,
                                                 {"metadata.broker.list": broker,
                                                  "group.id": consumer_group_id},fromOffsets=fromOffset)
        else:
            offset_reset = "smallest"  # smallest OR largest
            return KafkaUtils.createDirectStream(ssc, kafka_topic,
                                                 {"metadata.broker.list": broker, "auto.offset.reset":offset_reset,
                                                  "group.id": consumer_group_id})
    except Exception as e:
        print(e)
Example #6
def create_context():
    spark = get_session(SPARK_CONF)
    ssc = StreamingContext(spark.sparkContext, BATCH_DURATION)
    ssc.checkpoint(CHECKPOINT)
    # start offsets from the beginning
    # (won't apply if the context is recovered from an existing checkpoint)
    offsets = {TopicAndPartition(topic, 0): 0 for topic in TOPICS}
    stream = KafkaUtils.createDirectStream(ssc, TOPICS, KAFKA_PARAMS, offsets)
    main(stream)
    return ssc
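
The checkpoint comment above is the key detail: create_context only runs when no checkpoint exists, so the hard-coded offsets apply only to a cold start. A typical driver for this pattern, sketched under that assumption, would be:

# Sketch only: StreamingContext.getOrCreate restores the context (including Kafka
# offsets) from CHECKPOINT if one exists; create_context, and with it the offsets
# dict above, is only invoked when the checkpoint directory is empty.
if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate(CHECKPOINT, create_context)
    ssc.start()
    ssc.awaitTermination()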
Example #7
    def test_kafka_direct_stream_from_offset(self):
        """Test the Python direct Kafka stream API with start offset specified."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        fromOffsets = {TopicAndPartition(topic, 0): long(0)}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
        self._validateStreamResult(sendData, stream)
Example #8
def main():
    record_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               "offset.txt")
    with open(record_path, 'r') as f:
        start = json.loads(f.read())
    start_0, start_1, start_2 = start['start_0'], start['start_1'], start[
        'start_2']

    kafkaStreams = KafkaUtils.createDirectStream(
        ssc, [topic],
        kafkaParams={"metadata.broker.list": brokers},
        fromOffsets={
            TopicAndPartition(topic, 0): int(start_0),
            TopicAndPartition(topic, 1): int(start_1),
            TopicAndPartition(topic, 2): int(start_2)
        })
    kafkaStreams.transform(storeOffsetRanges).map(format_data).foreachRDD(
        process)

    ssc.start()
    ssc.awaitTermination()
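
storeOffsetRanges, format_data and process are not shown in Example #8. The usual shape of storeOffsetRanges, following the pattern also quoted in the comments of Example #19, is a pass-through transform that captures rdd.offsetRanges() so a later stage can write the new untilOffset values back to offset.txt; a sketch:

# Sketch only: remember each batch's offset ranges without changing the data.
offset_ranges = []

def storeOffsetRanges(rdd):
    global offset_ranges
    offset_ranges = rdd.offsetRanges()
    return rdd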
Example #9
    def test_kafka_rdd_with_leaders(self):
        """Test the Python direct Kafka RDD API with leaders."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
        address = self._kafkaTestUtils.brokerAddress().split(":")
        leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
        self._validateRddResult(sendData, rdd)
Example #10
    def _fetch_offsets(url):
        engine = create_engine(url)
        result = dict()
        with engine.begin() as conn:
            resultset = conn.execute(text(SELECT_OFFSETS_QUERY))
            result = {
                TopicAndPartition(topic=row['topic'],
                                  partition=int(row['partition'])):
                long(row['offset'])
                for row in resultset.fetchall()
            }

        return result
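
SELECT_OFFSETS_QUERY is not shown. Given the topic, partition and offset columns the row mapping reads, a purely illustrative definition could look like the following (the real query and identifier quoting depend on the actual database; compare the kafka_offsets table in Example #23):

# Hypothetical query text, for illustration only; not part of the original example.
SELECT_OFFSETS_QUERY = """
    SELECT topic, partition, offset
    FROM kafka_offsets
"""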
Example #11
def functionToCreateContext():
    sc = SparkContext(appName=APP_NAME)
    ssc = StreamingContext(sc, PERIOD)

    offsets = {TopicAndPartition(topic, 0): long(0) for topic in TOPICS}
    kafkaParams = {"metadata.broker.list": BROKERS, "group.id": GROUP_ID,
                   "auto.offset.reset": "smallest"}

    stream = KafkaUtils.createDirectStream(ssc, TOPICS, kafkaParams, offsets)
    main(stream)

    ssc.checkpoint(CHECKPOINT)
    return ssc
Example #12
def read_offsets(topics):
    try:
        zk = PipelineUtils.getZookeeperInstance()
        from_offsets = {}
        for topic in topics:
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partion = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partion] = offset
        print("Previous offset -->", from_offsets)
        return from_offsets
    except Exception as e:
        print("An unexpected error occurred while reading offset", e)
Example #13
    def get_kafka_stream(topic, streaming_context):
        offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
        app_name = streaming_context.sparkContext.appName
        saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
        if len(saved_offset_spec) < 1:

            MonMetricsKafkaProcessor.log_debug(
                "No saved offsets available..."
                "connecting to kafka without specifying offsets")
            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers})

            return kvs

        else:
            from_offsets = {}
            for key, value in saved_offset_spec.items():
                if key.startswith("%s_%s" % (app_name, topic)):
                    # spec_app_name = value.get_app_name()
                    spec_topic = value.get_topic()
                    spec_partition = int(value.get_partition())
                    # spec_from_offset = value.get_from_offset()
                    spec_until_offset = value.get_until_offset()
                    # composite_key = "%s_%s_%s" % (spec_app_name,
                    #                               spec_topic,
                    #                               spec_partition)
                    # partition = saved_offset_spec[composite_key]
                    from_offsets[
                        TopicAndPartition(spec_topic, spec_partition)
                    ] = long(spec_until_offset)

            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream :"
                " topic:{%s} : start " % topic)
            for key, value in from_offsets.items():
                MonMetricsKafkaProcessor.log_debug(
                    "get_kafka_stream: calling createDirectStream : "
                    "offsets : TopicAndPartition:{%s,%s}, value:{%s}" %
                    (str(key._topic), str(key._partition), str(value)))
            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream : "
                "topic:{%s} : done" % topic)

            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers},
                from_offsets)
            return kvs
def get_kafka_stream(spark_streaming_context):
    topicPartion = TopicAndPartition(TOPIC, PARTITION)
    fromOffset = {topicPartion: long(START)}

    kafkaParams = {
        "metadata.broker.list": KAFKA_BROKER,
        'auto.offset.reset': 'smallest',
        "group.id": "group_id_1"
    }

    kafka_stream = KafkaUtils.createDirectStream(spark_streaming_context,
                                                 [TOPIC],
                                                 kafkaParams=kafkaParams,
                                                 fromOffsets=fromOffset)
    return kafka_stream
Example #15
def read_offsets(zk, topics):
    from pyspark.streaming.kafka import TopicAndPartition

    from_offsets = {}
    try:
        for topic in topics:
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partion = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partion] = offset

    except Exception as e:
        print("Excep :: " + str(e))

    return from_offsets
def read_offsets(ssc, zk, topics, kafkaParams):
    from pyspark.streaming.kafka import TopicAndPartition
    print("zk===", zk)
    print("topics===", topics)
    from_offsets = {}
    try:
        for topic in topics:
            print("topic=====",topic)
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partion = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partion] = offset
    except Exception as e:
        print("read offset error=============",e)

    print("===from_offsets=====", from_offsets)
    return from_offsets
Example #17
    def initialize_stream(self):
        """
        initializes stream from Kafka topic
        """
        topic, n = self.kafka_config["TOPIC"], self.kafka_config["PARTITIONS"]
        try:
            fromOffsets = {
                TopicAndPartition(topic, i): long(self.start_offset)
                for i in range(n)
            }
        except:
            fromOffsets = None

        self.dataStream = KafkaUtils.createDirectStream(
            self.ssc, [topic],
            {"metadata.broker.list": self.kafka_config["BROKERS_IP"]},
            fromOffsets=fromOffsets)
Example #18
def readOffsets(zk, topics, groupID):
    from_offsets = {}
    for topic in topics:
        childName = '/consumers/' + topic
        zk.ensure_path(childName)
        for partition in zk.get_children(childName):
            childPart = childName + '/' + partition
            zk.ensure_path(childPart)
            topic_partition = TopicAndPartition(topic, int(partition))
            try:
                offset = int(zk.get(childPart)[0])
            except:
                print(" ============= Get child partition error ============== ")
                return None
            from_offsets[topic_partition] = offset

    return from_offsets
Example #19
def save_by_spark_streaming():
    root_path = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(root_path, "offset.txt")
    print("offset.txt--save--record_path%s" % (record_path))
    from_offsets = {}
    # 获取已有的offset,没有记录文件时则用默认值即最大值
    if os.path.exists(record_path):
        f = open(record_path, "r")
        offset_data = json.loads(f.read())
        f.close()
        if offset_data["topic"] != topic_name:
            raise Exception("the topic name in offset.txt is incorrect")

        topic_partion = TopicAndPartition(offset_data["topic"], offset_data["partition"])
        from_offsets = {topic_partion: int(offset_data["untilOffset"])}  # note how the starting offset is set
        print("start from offsets: %s" % (from_offsets))

    sc = SparkContext(appName="Realtime-Analytics-Engine")
    ssc = StreamingContext(sc, int(timer))

    kvs = KafkaUtils.createDirectStream(ssc=ssc, topics=[topic_name], fromOffsets=from_offsets,
                                        kafkaParams={"metadata.broker.list": broker_list})

    # offset-handling pattern from the official docs:
    # directKafkaStream \
    #     .transform(storeOffsetRanges) \
    #     .foreachRDD(printOffsetRanges)

    # transactional processing
    # kvs.foreachRDD(lambda rec: deal_data(rec))

    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    # save the offsets
    kvs.transform(store_offset_ranges).foreachRDD(save_offset_ranges)

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
Example #20
def save_by_spark_streaming():
    root_path = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(root_path, "offset.txt")
    from_offsets = {}
    # load the saved offsets; if the record file does not exist, fall back to the default (the latest offset)
    if os.path.exists(record_path):
        f = open(record_path, "r")
        offset_data = json.loads(f.read())
        f.close()
        if offset_data["topic"] != topic_name:
            raise Exception("the topic name in offset.txt is incorrect")

        topic_partion = TopicAndPartition(offset_data["topic"],
                                          offset_data["partition"])
        print('topic_partion', type(topic_partion))
        from_offsets = {
            str(topic_partion): int(offset_data["untilOffset"])
        }  # how the starting offset is set (the author assumed topic_partion could not be used as a dict key without converting it to str)
        print("start from offsets: %s" % from_offsets)
        print("type(from_offsets)", type(from_offsets))

    sc = SparkContext(appName="Realtime-Analytics-Engine")
    ssc = StreamingContext(sc, int(timer))
    '''
    When createDirectStream reads from_offsets it raises:
    AttributeError: 'str' object has no attribute '_jTopicAndPartition'
    This contradicts converting topic_partion to str above; the concrete reason is this internal code:
    jfromOffsets = dict([(k._jTopicAndPartition(helper),
                              v) for (k, v) in fromOffsets.items()])
    '''
    #kvs = KafkaUtils.createDirectStream(ssc=ssc, topics=[topic_name], fromOffsets=from_offsets,kafkaParams={"metadata.broker.list": broker_list})
    kvs = KafkaUtils.createDirectStream(
        ssc=ssc,
        topics=[topic_name],
        kafkaParams={"metadata.broker.list": broker_list})
    kvs.foreachRDD(lambda rec: deal_data(rec))
    kvs.transform(store_offset_ranges).foreachRDD(save_offset_ranges)

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
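
As the note above explains, the str() conversion is exactly what breaks createDirectStream. A minimal corrected sketch, keeping the example's variable names and keying from_offsets by the TopicAndPartition object itself (as the other examples on this page do):

# Sketch only: TopicAndPartition instances are hashable, so they can be used as
# dict keys directly; no str() conversion is needed.
topic_partion = TopicAndPartition(offset_data["topic"], offset_data["partition"])
from_offsets = {topic_partion: int(offset_data["untilOffset"])}
kvs = KafkaUtils.createDirectStream(
    ssc=ssc,
    topics=[topic_name],
    fromOffsets=from_offsets,
    kafkaParams={"metadata.broker.list": broker_list})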
def read_offsets(zk, topics, consumer_group):
    """
    등록된 zookeeper 로부터 특정 topic 을 consume 하는 특정 consumer_group 의 오프셋을 읽어옴
    """
    from_offsets = {}
    for topic in topics:
        child_nodes = zk.get_children(
            f"/consumers/{consumer_group}/owners/{topic}")
        for partition in child_nodes:
            topic_partition = TopicAndPartition(topic, int(partition))
            partition_offset = zk.get(
                f"/consumers/{consumer_group}/owners/{topic}/{partition}")
            print(f"{partition} offset :", partition_offset)
            if not partition_offset[0]:  # zk.get() returns (data, stat); check the stored data
                print(
                    'The spark streaming started first time and offset value should be ZERO.'
                )
                offset = 0
            else:
                offset = int(partition_offset[0])
            from_offsets[topic_partition] = offset
    print("from_offset:", from_offsets)
    return from_offsets
Example #22
def streaming():
    global wtopic
    for i in range(0, len(Topic)):
        print(Topic[i])
        fromOffsets = {TopicAndPartition(Topic[i], 0): long(0)}
        kafkaParams = {"metadata.broker.list": 'localhost:9092'}
        ks = KafkaUtils.createDirectStream(ssc, [Topic[i]], \
                                              kafkaParams, \
                                              fromOffsets)

        ks.foreachRDD(handler)
        if (Topic[i] == 'Task10-DlyPrices'):
            print("prices")
            lines = ks.map(lambda v: v[1])
            tlines = lines.map(lambda prices: (prices.split(",")))
            rlines = tlines.map(lambda t: (t[0], str(t[1]), t[2], t[3], t[4], \
                                        t[5],t[6],t[7]))
            rlines.foreachRDD(CreateDffortuple)
        else:
            wtopic = Topic[i]
            print("other ", wtopic)
            lines = ks.map(lambda v: v[1])
            lines.foreachRDD(CreateDfforjson)
Example #23
def getStartOffsets(task, topic, partitions):
    connection = MySQLdb.connect(user='******',
                                 db='test',
                                 host="127.0.0.1",
                                 passwd="")
    cursor = connection.cursor()

    que = 'SELECT `partition`, `offset` FROM `test`.`kafka_offsets` WHERE `task`="%s" AND `topic`="%s"' % (
        task, topic)
    print(que)
    cnt = cursor.execute(que)
    if not cnt:
        for p in range(partitions):
            que = 'INSERT INTO test.kafka_offsets (`task`,`topic`,`partition`,`offset`) VALUES ("%s","%s",%s,0)' % (
                task, topic, p)
            print(que)
            cnt = cursor.execute(que)
            connection.commit()
        return getStartOffsets(task, topic, partitions)
    ret = {}
    for row in cursor.fetchall():
        ret[TopicAndPartition(topic, row[0])] = long(row[1])
    connection.close()
    return ret
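
getStartOffsets only covers the read path. A write-side counterpart (a hypothetical saveEndOffsets, not part of the example) could push each batch's untilOffset back into the same test.kafka_offsets table, using parameterized queries rather than string formatting:

# Sketch only: persist the end offsets of a processed batch, e.g. from rdd.offsetRanges().
def saveEndOffsets(task, topic, offset_ranges):
    connection = MySQLdb.connect(user='******', db='test', host="127.0.0.1", passwd="")
    cursor = connection.cursor()
    for o in offset_ranges:
        cursor.execute(
            'UPDATE test.kafka_offsets SET `offset`=%s '
            'WHERE `task`=%s AND `topic`=%s AND `partition`=%s',
            (o.untilOffset, task, topic, o.partition))
    connection.commit()
    connection.close()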
Example #24
        globals()['KazooSingletonInstance'].start()
    return globals()['KazooSingletonInstance']


def save_offsets(rdd):
    zk = get_zookeeper_instance()
    for offset in rdd.offsetRanges():
        path = f"/consumers"
        print(path)
        zk.ensure_path(path)
        zk.set(path, str(offset.untilOffset).encode())


TOPIC = 'anna'
PARTITION = 0
topicAndPartition = TopicAndPartition(TOPIC, PARTITION)
fromOffsets = {topicAndPartition: 0}  # start reading the partition from offset 0


def main(brokers="127.0.0.1:9092", topics=['anna']):
    sc = SparkContext(appName="PythonStreamingSaveOffsets")
    ssc = StreamingContext(sc, 2)

    directKafkaStream = KafkaUtils.createDirectStream(
        ssc,
        topics, {"metadata.broker.list": brokers},
        fromOffsets=fromOffsets)

    directKafkaStream.foreachRDD(save_offsets)
    ssc.start()
    ssc.awaitTermination()
Example #25
import random
from datetime import datetime

import pyspark
from pyspark.sql import SQLContext, Row
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition

sc = pyspark.SparkContext()
ssc = StreamingContext(sc, 20)
sqlContext = SQLContext(sc)

topic = "notificacion_eventos_internos"
brokers = "127.0.0.1:9092"
partition = 0
start = 0
topicpartion = TopicAndPartition(topic, partition)
fromoffset = {topicpartion: int(start)}

kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                    {"metadata.broker.list": brokers},
                                    fromOffsets=fromoffset)
data = kvs.map(lambda line: line)
#data.write.parquet("hdfs://data.parquet")
schema = StructType(
    [StructField(str(i), StringType(), True) for i in range(2)])


def saveData(rdd):
    now = datetime.now()
    current_time = now.strftime("%Y%m%d_%H%M%S")
    #rdd.saveAsTextFile("resultados/raw-${System.currentTimeInMillis()}.txt")
Example #26
def extractInfo(flight, pm=False):
    flightDate = datetime.date(int(flight[0]), int(flight[1]), int(flight[2]))
    yDest = flight[7]
    if pm:
        yDest = flight[6]
        flightDate -= datetime.timedelta(days=2)
    return ((str(flightDate), yDest), (flight[6], flight[7], flight[4],
                                       flight[5], flight[8],
                                       float(flight[10].strip('\"'))))


sc = SparkContext(appName="bestFlights")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
ssc.checkpoint("s3://mudabircapstonecheckpoint/bestFlights/")
topicPartition = TopicAndPartition("airportsAll2", 0)
fromOffset = {topicPartition: 0}
kafkaParams = {
    "metadata.broker.list":
    "b-2.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-3.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-1.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092"
}

stream = KafkaUtils.createDirectStream(ssc, ['airportsAll2'],
                                       kafkaParams,
                                       fromOffsets=fromOffset)
'''
The incoming data format is
Year|Month|date|DayofWeek|UniqueCarrier|FlightNum|Origin|Dest|CRSDeptime|DepDelay|ArrDelay
'''

rdd = stream.map(lambda x: x[1])
Example #27
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import TopicAndPartition
from pyspark.streaming.kafka import KafkaUtils


def toredis(rdd):
    import redis
    rclient = redis.Redis(host="172.17.0.7", port=6379)
    for y in rdd:
        print(y)
        rclient.set(y[0], y[1])
        rclient.set(y, 5)


sparkcontext = SparkContext("spark://0.0.0.0:7077", "spark_test01")
offsets = {}
part01 = TopicAndPartition('', 2)
sparkcontext.addPyFile('redis.zip')
ssc = StreamingContext(sparkcontext, 5)

kafka_strem_context = KafkaUtils.createDirectStream(
    ssc, ['flumetest2'], {
        "metadata.broker.list": '172.17.0.6:9092',
        'auto.offset.reset': 'largest'
    })
kafka_strem_context.map(lambda x: (x.split("|@|")[0], x.split("|@|")[2])
                        ).reduceByKey(lambda a, b: int(a) + int(b)).foreachRDD(
                            lambda q: q.foreachPartition(toredis))
# kafka_strem_context.map(lambda x: (x.split("|@|")[0],x.split("|@|")[2])).reduceByKey(lambda a, b: int(a)+int(b)).foreachPartition(toredis)# AttributeError: 'TransformedDStream' object has no attribute 'foreachPartition'

ssc.start()
ssc.awaitTermination()
Example #28
def main():
    sc = SparkContext(appName="Twitchatter")
    sc.setLogLevel('ERROR')
    # broadcast the emotes set
    global_emotes = sc.broadcast(load_emotes())
    #print(global_emotes.value.keys())
    sub_emotes = sc.broadcast(load_subemotes())
    #print(sub_emotes.value.keys()[:10])

    batch_duration = 6
    ssc = StreamingContext(sc, batch_duration)  # 6-second batches
    # set checkpoint directory:use default fs protocol in core-site.xml
    ssc.checkpoint("hdfs://" + config.spark_ckpt)

    zkQuorum = [config.zk_address]
    topic = [config.topic]
    print("{}{}".format(zkQuorum, topic))

    partition = 0
    start = 0
    topicpartition = TopicAndPartition(topic[0], partition)

    kvs = KafkaUtils.createDirectStream(
        ssc, topic, {"metadata.broker.list": config.ip_address})
    # uncomment the following if running sum
    #kvs = KafkaUtils.createDirectStream(ssc,topic,{"metadata.broker.list": config.ip_address},
    #        fromOffsets={topicpartition: int(start)})
    #kvs.checkpoint(600)

    parsed = kvs.map(lambda v: json.loads(v[1]))

    window_duration, sliding_duration = 12, 12

    # (1) total count of emotes for given channel
    def get_emotes_count(x):
        line = x.split(" ")
        words = [item.encode('utf-8') for item in line]
        emotes = [item for item in words if item in global_emotes.value]
        #emotes = [item for item in words if item in global_emotes.value.keys()]
        return dict(Counter(emotes))

    def sum_dict(x, y):
        return {k: x.get(k, 0) + y.get(k, 0) for k in set(x) | set(y)}

    def sub_dict(x, y):
        return {k: x.get(k, 0) - y.get(k, 0) for k in set(x) | set(y)}

    def get_count(x):
        line = x.split(" ")
        words = [item.encode('utf-8') for item in line]
        #emotes = [item for item in words if item in global_emotes.value.keys()]
        #subemotes = [item for item in words if item in sub_emotes.value.keys()]
        emotes = [item for item in words if item in global_emotes.value]
        subemotes = [item for item in words if item in sub_emotes.value]
        return [len(emotes), len(subemotes)]

    def sum_list(x, y):
        return [x[0] + y[0], x[1] + y[1]]

    def sub_list(x, y):
        return [x[0] - y[0], x[1] - y[1]]

    channel_count_time = parsed.map(lambda v: (v[u'channel'],v[u'message']))\
                               .mapValues(get_count)\
                               .reduceByKeyAndWindow(sum_list,sub_list,window_duration,sliding_duration)\
                               .map(lambda v: {"channel":v[0],\
                                               "global_emotes":v[1][0],\
                                               "subscriber_emotes":v[1][1],\
                                               "total_emotes":(v[1][0]+v[1][1]),\
                                               "timestamp":datetime.datetime.now()\
                                               .strftime("%Y-%m-%d %H:%M:%S")})
    #channel_count_time.pprint()

    # 2) get individual emotes count for given channel
    channel_message = parsed.map(lambda v: [(v[u'channel'],word) \
                                 for word in v[u'message'].split(" ")])\
                                 .flatMap(lambda x: x)

    #channel_message.pprint()

    def get_global(x):
        if x[1] in global_emotes.value:
            #if x[1] in global_emotes.value.keys():
            return True
        else:
            return False

    def get_sub(x):
        if x[1] in sub_emotes.value:
            #if x[1] in sub_emotes.value.keys():
            return True
        else:
            return False
    channel_emotes = channel_message.filter(get_global)\
                                    .map(lambda v: (v[0],v[1],True))
    #channel_emotes.pprint()
    channel_subemotes = channel_message.filter(get_sub)\
                                    .map(lambda v: (v[0],v[1],False))
    #channel_subemotes.pprint()

    time_channel_emotes_count = channel_emotes.union(channel_subemotes)\
                                              .map(lambda v: ((v[0],v[1],v[2]),1))\
                                              .reduceByKeyAndWindow(lambda x,y: x+y,lambda x,y:x-y,\
                                                                    window_duration, sliding_duration)\
                                              .map(lambda v: { "timestamp":datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
                                                               "channel":v[0][0],\
                                                               "emote_name":v[0][1],\
                                                               "is_free":v[0][2],\
                                                               "count":v[1]})
    #time_channel_emotes_count.pprint()
    # 3) get world cup Footy emotes count for all channels
    footy_count = parsed.flatMap(lambda v: v[u'message'].split(" "))\
                            .filter(lambda x: 'Footy' in x)\
                            .map(lambda x: (x,1))\
                            .reduceByKeyAndWindow(lambda x,y: x+y,lambda x,y:x-y,\
                                              window_duration, sliding_duration)\
                            .map(lambda v: { "timestamp":datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), \
                            "emote_name":v[0],\
                            "count":v[1]})

    channel_count_time.saveToCassandra(config.cass_keyspace,
                                       "channel_count_time")
    time_channel_emotes_count.saveToCassandra(config.cass_keyspace,
                                              "time_channel_emotes_count")
    footy_count.saveToCassandra(config.cass_keyspace, "time_footy_count")

    ssc.start()
    ssc.awaitTermination()
Example #29
def main():
    sc = SparkContext(appName="Twitchatter")
    sc.setLogLevel('ERROR')
    # broadcast the emotes set
    global_emotes = sc.broadcast(load_emotes())
    #print(global_emotes.value.keys())
    sub_emotes = sc.broadcast(load_subemotes())
    #print(sub_emotes.value.keys()[:10])

    batch_duration = 5
    ssc = StreamingContext(sc, batch_duration)  # 5-second batches
    # set checkpoint directory:use default fs protocol in core-site.xml
    ssc.checkpoint("hdfs://" + config.spark_ckpt)

    zkQuorum = [config.zk_address]
    topic = [config.topic]
    print("{}{}".format(zkQuorum, topic))

    partition = 0
    start = 0
    topicpartition = TopicAndPartition(topic[0], partition)

    kvs = KafkaUtils.createDirectStream(
        ssc, topic, {"metadata.broker.list": config.ip_address})
    # uncomment the following if running sum
    #kvs = KafkaUtils.createDirectStream(ssc,topic,{"metadata.broker.list": config.ip_address},
    #        fromOffsets={topicpartition: int(start)})
    #kvs.checkpoint(600)

    parsed = kvs.map(lambda v: json.loads(v[1]))

    window_duration, sliding_duration = 60, 20

    # (1) total count of emotes for given channel
    def get_emotes_count(x):
        line = x.split(" ")
        words = [item.encode('utf-8') for item in line]
        emotes = [item for item in words if item in global_emotes.value.keys()]
        return dict(Counter(emotes))

    def sum_dict(x, y):
        return {k: x.get(k, 0) + y.get(k, 0) for k in set(x) | set(y)}

    def sub_dict(x, y):
        return {k: x.get(k, 0) - y.get(k, 0) for k in set(x) | set(y)}

    def get_count(x):
        line = x.split(" ")
        words = [item.encode('utf-8') for item in line]
        emotes = [item for item in words if item in global_emotes.value.keys()]
        subemotes = [item for item in words if item in sub_emotes.value.keys()]
        return [len(emotes), len(subemotes)]

    def sum_list(x, y):
        return [x[0] + y[0], x[1] + y[1]]

    def sub_list(x, y):
        return [x[0] - y[0], x[1] - y[1]]

    channel_count_time = parsed.map(lambda v: (v[u'channel'],v[u'message']))\
                               .mapValues(get_count)\
                               .reduceByKeyAndWindow(sum_list,sub_list,window_duration,sliding_duration)\
                               .map(lambda v: {"channel":v[0],\
                                               "global_emotes":v[1][0],\
                                               "subscriber_emotes":v[1][1],\
                                               "total_emotes":(v[1][0]+v[1][1]),\
                                               "timestamp":datetime.datetime.now()\
                                               .strftime("%Y-%m-%d %H:%M:%S")})
    #channel_count_time.pprint()

    # 2) get individual emotes count for given channel
    channel_message = parsed.map(lambda v: [(v[u'channel'],word) \
                                 for word in v[u'message'].split(" ")])\
                                 .flatMap(lambda x: x)

    #channel_message.pprint()

    def get_global(x):
        if x[1] in global_emotes.value.keys():
            return True
        else:
            return False

    def get_sub(x):
        if x[1] in sub_emotes.value.keys():
            return True
        else:
            return False
    channel_emotes = channel_message.filter(get_global)\
                                    .map(lambda v: (v[0],v[1],True))
    #channel_emotes.pprint()
    channel_subemotes = channel_message.filter(get_sub)\
                                    .map(lambda v: (v[0],v[1],False))
    #channel_subemotes.pprint()

    time_channel_emotes_count = channel_emotes.union(channel_subemotes)\
                                              .map(lambda v: ((v[0],v[1],v[2]),1))\
                                              .reduceByKeyAndWindow(lambda x,y: x+y,lambda x,y:x-y,\
                                                                    window_duration, sliding_duration)\
                                              .map(lambda v: { "timestamp":datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
                                                               "channel":v[0][0],\
                                                               "emote_name":v[0][1],\
                                                               "is_free":v[0][2],\
                                                               "count":v[1],\
                                                               })
    #time_channel_emotes_count.pprint()

    # connect to cassandra cluster
    cluster = Cluster([config.cass_seedip])
    session = cluster.connect()

    # create and set cassandra keyspace to work only once.
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS " + config.cass_keyspace +
        " WITH replication = {'class':                 'SimpleStrategy', 'replication_factor': '3'};"
    )
    session.set_keyspace(config.cass_keyspace)

    # create tables to insert data
    session.execute(
        "CREATE TABLE IF NOT EXISTS channel_count_time (channel text, global_emotes int, subscriber_emotes int, total_emotes int, timestamp text, primary key(channel,timestamp));"
    )

    channel_count_time.saveToCassandra(config.cass_keyspace,
                                       "channel_count_time")

    session.execute(
        "CREATE TABLE IF NOT EXISTS time_channel_emotes_count (timestamp text, channel text, emote_name text, is_free boolean, count int, primary key(emote_name,timestamp));"
    )

    time_channel_emotes_count.saveToCassandra(config.cass_keyspace,
                                              "time_channel_emotes_count")

    ssc.start()
    ssc.awaitTermination()
Example #30
        print(airport)


def isFloat(row):
    try:
        float(row[10])
        return True
    except:
        return False


sc = SparkContext(appName="top10airports")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
ssc.checkpoint("s3://mudabircapstonecheckpoint/top10carriers/")
topicPartition = TopicAndPartition("airportsFull", 0)
fromOffset = {topicPartition: 0}
kafkaParams = {
    "metadata.broker.list":
    "b-2.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-3.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092,b-1.kafkacluster.qa2zr3.c2.kafka.us-east-1.amazonaws.com:9092"
}

stream = KafkaUtils.createDirectStream(ssc, ['airportsFull'],
                                       kafkaParams,
                                       fromOffsets=fromOffset)
'''
The incoming data format is
Year|Month|date|DayofWeek|UniqueCarrier|FlightNum|Origin|Dest|CRSDeptime|DepDelay|ArrDelay
'''

rdd = stream.map(lambda x: x[1])