Example #1
    def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP',
                                      'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka,
                                       consumer_group,
                                       self.topic,
                                       auto_commit=True,
                                       iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped,
                                     signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example #2
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ListeningKafkaSpider,
                       cls).from_crawler(crawler, *args, **kwargs)

        if not hasattr(spider, 'topic') or not spider.topic:
            spider.topic = '%s-starturls' % spider.name

        hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = crawler.settings.get(
            'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        spider.consumer = SimpleConsumer(_kafka,
                                         consumer_group,
                                         spider.topic,
                                         auto_commit=True,
                                         iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        crawler.signals.connect(spider.item_scraped,
                                signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % spider.kafka_topic)

        return spider
Example #3
def process(time, lines):
    """ Processing tweets 
    Input:
    lines: (ts string, uid string, state string, tweet vector)
    Output:
    Json: (ts string, uid string, topicVec vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    #    rowRDD=lines.map(lambda x: (x['timeStamp'], x['userId'], getMeanVector(x['tweet'])))\
    #                .filter(lambda (time, uid, vec): vec!=[])\
    #                .map(lambda x:Row(timestamp=x[0], uid=x[1], topicVec=x[2]))
    rowRDD=lines.map(lambda x: [((x['timeStamp'], x['userId']), word2vec(item)) for item in x['tweet'] if isInVolcabulary(item)] )\
                .flatMap(lambda x:x)\
                .filter(lambda (k, vec): vec!=[])\
                .reduceByKey(lambda x,y:x+y)\
                .map(lambda x:Row(timestamp=x[0][0], uid=x[0][1], topicVec=x[1]))

    #    print(rowRDD.take(10))
    print("========= %d =========" % rowRDD.count())
    #   save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(kafkaNodeBC.value)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timestamp" :"' + str(row['timestamp']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "topicVec":' + json.dumps(
                [float(i) for i in row['topicVec']]) + '}'
            #	    print(line)
            producer.send_messages(outgoingTopic, str(hash(line)), line)
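
A sketch of how a process(time, rdd) callback like this one is typically wired into a Spark Streaming job; the application name, batch interval, topic, and broker list below are assumptions:

import json
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="tweet-topics")
ssc = StreamingContext(sc, 10)  # 10-second micro-batches

stream = KafkaUtils.createDirectStream(
    ssc, ['tweet-topic'], {"metadata.broker.list": "localhost:9092"})
lines = stream.map(lambda kv: json.loads(kv[1]))  # message values are JSON strings

# foreachRDD passes (batch time, rdd) to a two-argument callback
lines.foreachRDD(process)

ssc.start()
ssc.awaitTermination()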
Example #4
def process(time, lines):
    """Calculate user-product corr table and select ad-push events
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    #   calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[1], time.isoformat()), v))\
                         .reduceByKey(lambda x, y: x + y)\
                         .map(lambda (x, u): (x, [(pid, score) for (pid, score) in ((y, float(u.dot(v)/(norm(u)*norm(v)))) for (y, v) in bv.value) if score > .90]))\
                         .filter(lambda (k, v): v != [])
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], score=x[1], ts=x[0][1]))
    #   print(rowRDD.take(10))
    #   saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    #   save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timeStamp" :"' + str(time) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + json.dumps(dict(row['score'])) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
Example #5
 def __init__(self, addr):
     self.client = SimpleClient(addr)
     self.producer = KeyedProducer(self.client)
     self.sess = requests.Session()
     adapter = requests.adapters.HTTPAdapter(max_retries=5)
     self.sess.mount('http://', adapter)
     self.sess.mount('https://', adapter)
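
The constructor above only wires up the retrying HTTP session and the producer; a hypothetical companion method that polls an HTTP endpoint and forwards records to Kafka might look like this (produce_msgs, source_url, the record fields, and the assumption that json is imported are all illustrative):

 def produce_msgs(self, source_url, topic):
     # fetch a JSON array over the retrying session and forward each record,
     # keyed by its 'id' field, to the given Kafka topic
     resp = self.sess.get(source_url, timeout=10)
     for record in resp.json():
         self.producer.send_messages(topic, str(record.get('id', '')), json.dumps(record))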
Example #6
def process(time, lines):
    """match user with bidder
    Input:
    lines: (ts string, uid string, {pid:score} dict)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    rowRDD=lines.map(lambda x: ( x['uid'], matchBids(x['score']) ))\
                .map(lambda x:Row(uid=x[0],pid=x[1][0],price=x[1][1]))
    print("========= %d =========" % rowRDD.count())
    if (rowRDD.count() > 0):
        #   send to kafka
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "pid" :"' + str(row['pid']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "price":"' + str(row['price']) + '",'
            line += '  "ts":' + str(time) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)


        # save to cassandra
        rowRDD.map(lambda x:Row(pid=x['pid'],ts=str(time),price=x['price']))\
              .toDF().write\
              .format("org.apache.spark.sql.cassandra")\
              .options(table='winningbid10s', keyspace='ad_flow')\
              .save(mode="append")
Example #7
def process(time, lines):
    """1. select user to push ads
       2. save user-product corr table to cassandra
       3. match user with bidder
       4. save bid winner to cassandra 
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    # lines1s=lines.map(lambda x: ( (x['uid'], roundTime(parser.parse(x['tick']),1).isoformat()), np.asarray([1]+[float(i) for i in x['topic']])))
    # lines1s=lines.map(lambda x: ( x[0], 1))
    runningWindow = lines.map(lambda (k, v): ((k[0], time.isoformat()), v))\
                         .reduceByKey(lambda x, y: x + y)\
                         .map(lambda (x, u): [(x, y, float(u.dot(v)/(norm(u)*norm(v)))) for (y, v) in bv.value])\
                         .flatMap(lambda x: x)\
                         .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    # print(rowRDD.take(10))
    # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    # save corr table to cassandra
    if rowRDD.count() > 0:
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += '  "pid":' + str(row['pid']) + '}'
            # print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
Example #8
def main(servers: List[str]) -> None:
    """
    Main Method

    Arguments:
        servers: List of Zookeeper Kafka Host IPs
    """
    mysql_session = pymysql.connect(**MYSQL_CONF)

    users = query_for_users(mysql_session)
    photos: Deque = deque([], maxlen=3000)
    tags = query_for_tags(mysql_session)
    locations = query_for_locations(mysql_session)

    simple_client = SimpleClient(servers)
    producer = KeyedProducer(simple_client)

    events = [
        comment_producer,
        #create_user_producer,
        follow_producer,
        like_producer,
        create_photo_producer,
        unfollow_producer
    ]

    while True:
        event = generate_random_events(events)
        print(event(users, photos, tags, locations, producer))
        time.sleep(0.02)
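
The event callables in the list above are defined elsewhere; a hedged sketch of what one of them could look like, assuming each takes (users, photos, tags, locations, producer), publishes one keyed message, and returns something printable (the topic and field names are made up, and random and json are assumed to be imported):

def like_producer(users, photos, tags, locations, producer):
    # pick a random user and photo and emit a 'like' event keyed by user id
    if not photos:
        return 'no photos yet; skipping like event'
    user = random.choice(users)
    photo = random.choice(photos)
    event = {'event': 'like', 'user_id': user, 'photo_id': photo}
    producer.send_messages('likes', str(user), json.dumps(event))
    return event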
Example #9
 def __init__(self, addr):
     self.client = SimpleClient(addr)
     #self.producer  = KeyedProducer(self.client)
     self.producer = KafkaProducer(
         bootstrap_servers=addr + ":9092",
         value_serializer=lambda v: v.encode('utf-8'),
         acks=0,
         linger_ms=500)
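
Given the utf-8 value_serializer and acks=0 above, sending a record is a one-liner; a small sketch (the method, topic name, and payload are illustrative, and json is assumed to be imported):

 def send(self, record):
     # the value_serializer configured above encodes the str payload to bytes
     self.producer.send('events', json.dumps(record))
     self.producer.flush()  # linger_ms batches sends; flush forces delivery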
Example #10
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

        # The 1999 KDDCup network traffic dataset
        self.data_file = open('/home/ubuntu/opt/realtimeAnomalies/src/main/test/kddcup.testdata.unlabeled', 'r')
        self.mem_data = []
        for record in self.data_file:
            self.mem_data.append(record)
Example #11
    def getLogLines(self):
        # consume messages from the Kafka topic until the thread is told to stop
        while not thread_stop_event.isSet():
            client = SimpleClient('127.0.0.1:9092')
            consumer = SimpleConsumer(client, "my-producer", "alarm")

            for message in consumer:
                print(message.message.value)
                socketio.emit('newmessage', {'message': message.message.value},
                              namespace='/test')
                sleep(self.delay)
Example #12
    def __init__(self, host_list, topic, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_client = SimpleClient(host_list)
        self.key = kwargs.get("key", None)
        self.kafka_topic_name = topic

        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)
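
This handler only builds the producer; a minimal emit() to pair with it could look like the following sketch (not the library's own implementation):

    def emit(self, record):
        # avoid recursing on kafka-python's own log records
        if record.name.startswith('kafka'):
            return
        try:
            msg = self.format(record)
            if self.key is None:
                self.producer.send_messages(self.kafka_topic_name, msg.encode('utf-8'))
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key, msg.encode('utf-8'))
        except Exception:
            self.handleError(record)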
Example #13
 def from_settings(cls, settings):
     """
     :param settings: the current Scrapy settings
     :type settings: scrapy.settings.Settings
     :rtype: A :class:`~KafkaPipeline` instance
     """
     k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', '127.0.0.1:9092')
     topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'data-topic')
     client = SimpleClient(k_hosts)
     producer = SimpleProducer(client)
     return cls(producer, topic)
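
A sketch of the constructor and process_item that this factory method implies (the JSON serialization of the item is an assumption, and json is assumed to be imported):

 def __init__(self, producer, topic):
     self.producer = producer
     self.topic = topic

 def process_item(self, item, spider):
     # serialize the scraped item and publish it to the configured topic
     self.producer.send_messages(self.topic, json.dumps(dict(item)).encode('utf-8'))
     return item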
Example #14
def process(spouts):
    '''
    Returns a named tuple of type PartitionsSummary.
    '''
    results = []
    total_depth = 0
    total_delta = 0
    brokers = []
    for s in spouts:
        for p in s.partitions:
            try:
                k = SimpleClient([p['broker']['host'] + ":" + str(p['broker']['port'])])
            except socket.gaierror as e:
                raise ProcessorError('Failed to contact Kafka broker %s (%s)' % (p['broker']['host'], str(e)))

            earliest_off = OffsetRequestPayload(p['topic'], p['partition'], -2, 1)
            latest_off = OffsetRequestPayload(p['topic'], p['partition'], -1, 1)

            earliest = k.send_offset_request([earliest_off])[0].offsets[0]
            latest = k.send_offset_request([latest_off])[0].offsets[0]
            current = p['offset']

            brokers.append(p['broker']['host'])
            total_depth = total_depth + (latest - earliest)
            total_delta = total_delta + (latest - current)

            results.append(PartitionState._make([
                p['broker']['host'],
                p['topic'],
                p['partition'],
                earliest,
                latest,
                latest - earliest,
                s.id,
                current,
                latest - current]))
    return PartitionsSummary(total_depth=total_depth,
                             total_delta=total_delta,
                             num_partitions=len(results),
                             num_brokers=len(set(brokers)),
                             partitions=tuple(results))
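
PartitionState and PartitionsSummary are defined outside this snippet; compatible namedtuple definitions could look like this (the PartitionState field names are assumptions inferred from the _make() argument order):

from collections import namedtuple

PartitionState = namedtuple('PartitionState', [
    'broker', 'topic', 'partition', 'earliest', 'latest',
    'depth', 'spout', 'current', 'delta'])

PartitionsSummary = namedtuple('PartitionsSummary', [
    'total_depth', 'total_delta', 'num_partitions', 'num_brokers', 'partitions'])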
Example #15
 def get_values_kafak(self, groupName, topicName):
     kafka_values = dict()
     broker = SimpleClient(kafka_conn)
     zk = KazooClient(hosts=zookeepers_conn, read_only=True)
     zk.start()
     logsize = 0
     latest_offset = 0
     lag = 0
     if topicName:
         logsize = 0
         partitions = broker.get_partition_ids_for_topic(topicName)
         responses = broker.send_offset_fetch_request(groupName,
                                                      [OffsetFetchRequestPayload(topicName, p) for p in partitions],
                                                      fail_on_error=True)
         latest_offset = 0
         for res in responses:
             if topicName != "test":
                 latest_offset += res[2]
         for partition in partitions:
             log = "/consumers/%s/offsets/%s/%s" % (groupName, topicName, partition)
             if zk.exists(log):
                 data, stat = zk.get(log)
                 logsize += int(data)
         lag = latest_offset - logsize
     broker.close()
     zk.stop()
     zk.close()
     kafka_values['offset'] = latest_offset
     kafka_values['logsize'] = logsize
     kafka_values['lag'] = lag
     return kafka_values
Example #16
def get_offset(group, topicname):
    try:
        kafka_client = KafkaClient(kafka_conn, timeout=30)
    except Exception as e:
        print "Error, cannot connect kafka broker."
        sys.exit(1)
    else:
        kafka_topics = kafka_client.topics
    finally:
        kafka_client.close()

    try:
        zookeeper_client = KazooClient(hosts=zookeepers_conn,
                                       read_only=True,
                                       timeout=30)
        zookeeper_client.start()
    except Exception as e:
        print "Error, cannot connect zookeeper server."
        sys.exit(1)

    offset_total = 0
    logsize_total = 0
    broker = SimpleClient(kafka_conn)
    # partition_path = 'consumers/%s/offsets/%s' % (group, topicname)
    partitions = broker.get_partition_ids_for_topic(topicname)
    kafka_consumer = KafkaConsumer(bootstrap_servers=kafka_conn)

    for partition in partitions:
        base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topicname,
                                               partition)
        owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
        pdb.set_trace()
        offset = zookeeper_client.get('/' + offset_path)[0]
        offset_total += int(offset)

        # logsize_num = kafka_consumer.get_partition_offsets(topicname, partition, -1, 1)[0]
        # logsize_total += int(logsize_num)

    return offset_total, logsize_total
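
The commented-out get_partition_offsets() call uses an API that newer kafka-python releases removed; with kafka-python >= 1.3 the log-end offset can be fetched per partition roughly like this (a sketch meant to sit inside the loop above, with TopicPartition imported from kafka at module level):

        tp = TopicPartition(topicname, partition)
        logsize_num = kafka_consumer.end_offsets([tp])[tp]
        logsize_total += int(logsize_num)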
Example #17
 def __init__(self, addr):
     self.timezone = timezone('EST')
     self.host = 'ec2-34-192-152-48.compute-1.amazonaws.com'
     self.auction_db = 'auctiontable'
     self.bid_db = 'bidtable'
     self.client = SimpleClient(addr)
     self.producer = KeyedProducer(self.client)
     self.active_auctions = []
     self.conn_auction_db = None
     self.conn_bid_db = None
     self.auction_table = None
     self.bid_table = None
     self.connected_auction_db = False
     self.connected_bid_db = False
Example #18
    return well_stat, (wellID, str(reform_event_data))


def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%Y-%m-%d %H:%M:%S")
    d2 = datetime.strptime(d2, "%Y-%m-%d %H:%M:%S")
    return abs((d2 - d1).seconds)


if __name__ == "__main__":

    args = sys.argv
    ip_address = str(args[1])
    partition_key = str(args[2])

    client = SimpleClient(ip_address)
    producer = KeyedProducer(client)

    # --------------------------------
    # Defining data variables
    # --------------------------------
    dt = datetime.datetime(2017, 10, 9, 1, 59,
                           59)  # Defining a starting date and time
    max_step = 600  # max time step size
    numOfWells = 50000  # Number of wells

    # --------------------------------
    # Generating time series data
    # --------------------------------
    event_types = ['on', 'off']
    completion_types = ['BP', 'ESP']  # diff from 'completion_type' (no s)
Example #19
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.', add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store', required=False,
                        help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store', required=False,
                        help="The settings file to read from",
                        default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store', required=False,
                        help="The log level", default=None,
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                             " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")
    dump_parser.add_argument('-m', '--mongodb', action="store", help="Set mongodb to save webpages")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = SimpleClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)
        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        try:
            logger.debug("Getting Kafka consumer")

            offset = 'earliest' if args["from_beginning"] else 'latest'

            consumer = KafkaConsumer(
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable as ex:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = str(base64.b64decode(item['body']))

                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except BaseException as msg:
                        logger.info("Message is not a JSON object")
                        logger.info("base64 error: ", msg)
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except:
                logger.error(traceback.print_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024*1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                    .format(n=num_records, m=total_mbs,
                            kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
Example #20
    def configure(self, **configs):
        """Configure the consumer instance

        Configuration settings can be passed to constructor,
        otherwise defaults will be used:

        Keyword Arguments:
            bootstrap_servers (list): List of initial broker nodes the consumer
                should contact to bootstrap initial cluster metadata.  This does
                not have to be the full node list.  It just needs to have at
                least one broker that will respond to a Metadata API Request.
            client_id (str): a unique name for this client.  Defaults to
                'kafka.consumer.kafka'.
            group_id (str): the name of the consumer group to join,
                Offsets are fetched / committed to this group name.
            fetch_message_max_bytes (int, optional): Maximum bytes for each
                topic/partition fetch request.  Defaults to 1024*1024.
            fetch_min_bytes (int, optional): Minimum amount of data the server
                should return for a fetch request, otherwise wait up to
                fetch_wait_max_ms for more data to accumulate.  Defaults to 1.
            fetch_wait_max_ms (int, optional): Maximum time for the server to
                block waiting for fetch_min_bytes messages to accumulate.
                Defaults to 100.
            refresh_leader_backoff_ms (int, optional): Milliseconds to backoff
                when refreshing metadata on errors (subject to random jitter).
                Defaults to 200.
            socket_timeout_ms (int, optional): TCP socket timeout in
                milliseconds.  Defaults to 30*1000.
            auto_offset_reset (str, optional): A policy for resetting offsets on
                OffsetOutOfRange errors. 'smallest' will move to the oldest
                available message, 'largest' will move to the most recent.  Any
                other value will raise an exception.  Defaults to 'largest'.
            deserializer_class (callable, optional):  Any callable that takes a
                raw message value and returns a deserialized value.  Defaults to
                 lambda msg: msg.
            auto_commit_enable (bool, optional): Enabling auto-commit will cause
                the KafkaConsumer to periodically commit offsets without an
                explicit call to commit().  Defaults to False.
            auto_commit_interval_ms (int, optional):  If auto_commit_enabled,
                the milliseconds between automatic offset commits.  Defaults to
                60 * 1000.
            auto_commit_interval_messages (int, optional): If
                auto_commit_enabled, a number of messages consumed between
                automatic offset commits.  Defaults to None (disabled).
            consumer_timeout_ms (int, optional): number of milliseconds to wait
                before raising a timeout exception if no message is available
                for consumption.  Defaults to -1 (don't throw exception).

        Configuration parameters are described in more detail at
        http://kafka.apache.org/documentation.html#highlevelconsumerapi
        """
        configs = self._deprecate_configs(**configs)
        self._config = {}
        for key in self.DEFAULT_CONFIG:
            self._config[key] = configs.pop(key, self.DEFAULT_CONFIG[key])

        if configs:
            raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                          str(list(configs.keys())))

        if self._config['auto_commit_enable']:
            if not self._config['group_id']:
                raise KafkaConfigurationError(
                    'KafkaConsumer configured to auto-commit '
                    'without required consumer group (group_id)'
                )

        # Check auto-commit configuration
        if self._config['auto_commit_enable']:
            logger.info("Configuring consumer to auto-commit offsets")
            self._reset_auto_commit()

        if not self._config['bootstrap_servers']:
            raise KafkaConfigurationError(
                'bootstrap_servers required to configure KafkaConsumer'
            )

        self.metrics_reporter = self._config['metrics_reporter']

        self._client = SimpleClient(
            self._config['bootstrap_servers'],
            client_id=self._config['client_id'],
            timeout=(self._config['socket_timeout_ms'] / 1000.0),
            metrics_reporter=self.metrics_reporter
        )
Example #21
class KafkaConsumer(object):
    """A simpler kafka consumer"""
    DEFAULT_CONFIG = deepcopy(DEFAULT_CONSUMER_CONFIG)

    def __init__(self, *topics, **configs):
        self.configure(**configs)
        self.set_topic_partitions(*topics)

    def configure(self, **configs):
        """Configure the consumer instance

        Configuration settings can be passed to constructor,
        otherwise defaults will be used:

        Keyword Arguments:
            bootstrap_servers (list): List of initial broker nodes the consumer
                should contact to bootstrap initial cluster metadata.  This does
                not have to be the full node list.  It just needs to have at
                least one broker that will respond to a Metadata API Request.
            client_id (str): a unique name for this client.  Defaults to
                'kafka.consumer.kafka'.
            group_id (str): the name of the consumer group to join,
                Offsets are fetched / committed to this group name.
            fetch_message_max_bytes (int, optional): Maximum bytes for each
                topic/partition fetch request.  Defaults to 1024*1024.
            fetch_min_bytes (int, optional): Minimum amount of data the server
                should return for a fetch request, otherwise wait up to
                fetch_wait_max_ms for more data to accumulate.  Defaults to 1.
            fetch_wait_max_ms (int, optional): Maximum time for the server to
                block waiting for fetch_min_bytes messages to accumulate.
                Defaults to 100.
            refresh_leader_backoff_ms (int, optional): Milliseconds to backoff
                when refreshing metadata on errors (subject to random jitter).
                Defaults to 200.
            socket_timeout_ms (int, optional): TCP socket timeout in
                milliseconds.  Defaults to 30*1000.
            auto_offset_reset (str, optional): A policy for resetting offsets on
                OffsetOutOfRange errors. 'smallest' will move to the oldest
                available message, 'largest' will move to the most recent.  Any
                other value will raise an exception.  Defaults to 'largest'.
            deserializer_class (callable, optional):  Any callable that takes a
                raw message value and returns a deserialized value.  Defaults to
                 lambda msg: msg.
            auto_commit_enable (bool, optional): Enabling auto-commit will cause
                the KafkaConsumer to periodically commit offsets without an
                explicit call to commit().  Defaults to False.
            auto_commit_interval_ms (int, optional):  If auto_commit_enabled,
                the milliseconds between automatic offset commits.  Defaults to
                60 * 1000.
            auto_commit_interval_messages (int, optional): If
                auto_commit_enabled, a number of messages consumed between
                automatic offset commits.  Defaults to None (disabled).
            consumer_timeout_ms (int, optional): number of milliseconds to wait
                before raising a timeout exception if no message is available
                for consumption.  Defaults to -1 (don't throw exception).

        Configuration parameters are described in more detail at
        http://kafka.apache.org/documentation.html#highlevelconsumerapi
        """
        configs = self._deprecate_configs(**configs)
        self._config = {}
        for key in self.DEFAULT_CONFIG:
            self._config[key] = configs.pop(key, self.DEFAULT_CONFIG[key])

        if configs:
            raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                          str(list(configs.keys())))

        if self._config['auto_commit_enable']:
            if not self._config['group_id']:
                raise KafkaConfigurationError(
                    'KafkaConsumer configured to auto-commit '
                    'without required consumer group (group_id)'
                )

        # Check auto-commit configuration
        if self._config['auto_commit_enable']:
            logger.info("Configuring consumer to auto-commit offsets")
            self._reset_auto_commit()

        if not self._config['bootstrap_servers']:
            raise KafkaConfigurationError(
                'bootstrap_servers required to configure KafkaConsumer'
            )

        self.metrics_reporter = self._config['metrics_reporter']

        self._client = SimpleClient(
            self._config['bootstrap_servers'],
            client_id=self._config['client_id'],
            timeout=(self._config['socket_timeout_ms'] / 1000.0),
            metrics_reporter=self.metrics_reporter
        )

    def set_topic_partitions(self, *topics):
        """
        Set the topic/partitions to consume
        Optionally specify offsets to start from

        Accepts types:

        * str (utf-8): topic name (will consume all available partitions)
        * tuple: (topic, partition)
        * dict:
            - { topic: partition }
            - { topic: [partition list] }
            - { topic: (partition tuple,) }

        Optionally, offsets can be specified directly:

        * tuple: (topic, partition, offset)
        * dict:  { (topic, partition): offset, ... }

        Example:

        .. code:: python

            kafka = KafkaConsumer()

            # Consume topic1-all; topic2-partition2; topic3-partition0
            kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

            # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
            # using tuples --
            kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

            # using dict --
            kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })

        """
        self._topics = []
        self._client.load_metadata_for_topics()

        # Setup offsets
        self._offsets = OffsetsStruct(fetch=dict(),
                                      commit=dict(),
                                      highwater=dict(),
                                      task_done=dict())

        # Handle different topic types
        for arg in topics:

            # Topic name str -- all partitions
            if isinstance(arg, (six.string_types, six.binary_type)):
                topic = arg

                for partition in self._client.get_partition_ids_for_topic(topic):
                    self._consume_topic_partition(topic, partition)

            # (topic, partition [, offset]) tuple
            elif isinstance(arg, tuple):
                topic = arg[0]
                partition = arg[1]
                self._consume_topic_partition(topic, partition)
                if len(arg) == 3:
                    offset = arg[2]
                    self._offsets.fetch[(topic, partition)] = offset

            # { topic: partitions, ... } dict
            elif isinstance(arg, dict):
                for key, value in six.iteritems(arg):

                    # key can be string (a topic)
                    if isinstance(key, (six.string_types, six.binary_type)):
                        topic = key

                        # topic: partition
                        if isinstance(value, int):
                            self._consume_topic_partition(topic, value)

                        # topic: [ partition1, partition2, ... ]
                        elif isinstance(value, (list, tuple)):
                            for partition in value:
                                self._consume_topic_partition(topic, partition)
                        else:
                            raise KafkaConfigurationError(
                                'Unknown topic type '
                                '(dict key must be int or list/tuple of ints)'
                            )

                    # (topic, partition): offset
                    elif isinstance(key, tuple):
                        topic = key[0]
                        partition = key[1]
                        self._consume_topic_partition(topic, partition)
                        self._offsets.fetch[(topic, partition)] = value

            else:
                raise KafkaConfigurationError('Unknown topic type (%s)' % type(arg))

        # If we have a consumer group, try to fetch stored offsets
        if self._config['group_id']:
            self._get_commit_offsets()

        # Update missing fetch/commit offsets
        for topic_partition in self._topics:

            # Commit offsets default is None
            if topic_partition not in self._offsets.commit:
                self._offsets.commit[topic_partition] = None

            # Skip if we already have a fetch offset from user args
            if topic_partition not in self._offsets.fetch:

                # Fetch offsets default is (1) commit
                if self._offsets.commit[topic_partition] is not None:
                    self._offsets.fetch[topic_partition] = self._offsets.commit[topic_partition]

                # or (2) auto reset
                else:
                    self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)

        # highwater marks (received from server on fetch response)
        # and task_done (set locally by user)
        # should always get initialized to None
        self._reset_highwater_offsets()
        self._reset_task_done_offsets()

        # Reset message iterator in case we were in the middle of one
        self._reset_message_iterator()

    def close(self):
        """Close this consumer's underlying client."""
        self._client.close()

    def next(self):
        """Return the next available message

        Blocks indefinitely unless consumer_timeout_ms > 0

        Returns:
            a single KafkaMessage from the message iterator

        Raises:
            ConsumerTimeout after consumer_timeout_ms and no message

        Note:
            This is also the method called internally during iteration

        """
        self._set_consumer_timeout_start()
        while True:

            try:
                return six.next(self._get_message_iterator())

            # Handle batch completion
            except StopIteration:
                self._reset_message_iterator()

            self._check_consumer_timeout()

    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_max_wait_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages'
            )

        fetches = [FetchRequestPayload(topic, partition,
                                self._offsets.fetch[(topic, partition)],
                                max_bytes)
                   for (topic, partition) in self._topics]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False
        )

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                if self.metrics_reporter:
                    self.metrics_reporter('failed_payloads_count', 1)

                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = resp.topic
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                if self.metrics_reporter:
                    self.metrics_reporter('offset_out_of_range_count', 1)

                logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                               'offset %d (Highwatermark: %d)',
                               topic, partition,
                               self._offsets.fetch[(topic, partition)],
                               resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition))
                )
                continue

            except NotLeaderForPartitionError:
                if self.metrics_reporter:
                    self.metrics_reporter('not_leader_for_partition_count', 1)

                logger.warning("NotLeaderForPartitionError for %s - %d. "
                               "Metadata may be out of date",
                               topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                if self.metrics_reporter:
                    self.metrics_reporter('request_timed_out_count', 1)

                logger.warning("RequestTimedOutError for %s - %d",
                               topic, partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug('message offset less than fetched offset '
                                 'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg

    def get_partition_offsets(self, topic, partition, request_time_ms, max_num_offsets):
        """Request available fetch offsets for a single topic/partition

        Keyword Arguments:
            topic (str): topic for offset request
            partition (int): partition for offset request
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values.
                Specify -1 to receive the latest offset (i.e. the offset of the
                next coming message) and -2 to receive the earliest available
                offset. Note that because offsets are pulled in descending
                order, asking for the earliest offset will always return you a
                single element.
            max_num_offsets (int): Maximum offsets to include in the OffsetResponse

        Returns:
            a list of offsets in the OffsetResponse submitted for the provided
            topic / partition. See:
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
        """
        reqs = [OffsetRequestPayload(topic, partition, request_time_ms, max_num_offsets)]

        (resp,) = self._client.send_offset_request(reqs)

        check_error(resp)

        # Just for sanity..
        # probably unnecessary
        assert resp.topic == topic
        assert resp.partition == partition

        return resp.offsets

    def offsets(self, group=None):
        """Get internal consumer offset values

        Keyword Arguments:
            group: Either "fetch", "commit", "task_done", or "highwater".
                If no group specified, returns all groups.

        Returns:
            A copy of internal offsets struct
        """
        if not group:
            return {
                'fetch': self.offsets('fetch'),
                'commit': self.offsets('commit'),
                'task_done': self.offsets('task_done'),
                'highwater': self.offsets('highwater')
            }
        else:
            return dict(deepcopy(getattr(self._offsets, group)))

    def task_done(self, message):
        """Mark a fetched message as consumed.

        Offsets for messages marked as "task_done" will be stored back
        to the kafka cluster for this consumer group on commit()

        Arguments:
            message (KafkaMessage): the message to mark as complete

        Returns:
            True, unless the topic-partition for this message has not
            been configured for the consumer. In normal operation, this
            should not happen. But see github issue 364.
        """
        topic_partition = (message.topic, message.partition)
        if topic_partition not in self._topics:
            logger.warning('Unrecognized topic/partition in task_done message: '
                           '{0}:{1}'.format(*topic_partition))
            return False

        offset = message.offset

        # Warn on non-contiguous offsets
        prev_done = self._offsets.task_done[topic_partition]
        if prev_done is not None and offset != (prev_done + 1):
            logger.warning('Marking task_done on a non-continuous offset: %d != %d + 1',
                           offset, prev_done)

        # Warn on smaller offsets than previous commit
        # "commit" offsets are actually the offset of the next message to fetch.
        prev_commit = self._offsets.commit[topic_partition]
        if prev_commit is not None and ((offset + 1) <= prev_commit):
            logger.warning('Marking task_done on a previously committed offset?: %d (+1) <= %d',
                           offset, prev_commit)

        self._offsets.task_done[topic_partition] = offset

        # Check for auto-commit
        if self._does_auto_commit_messages():
            self._incr_auto_commit_message_count()

        if self._should_auto_commit():
            self.commit()

        return True

    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError(
                'Attempted to commit offsets '
                'without a configured consumer group (group_id)'
            )

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch
            # which is consistent with the Java Client
            # task_done is marked by messages consumed,
            # so add one to mark the next message for fetching
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(
                OffsetCommitRequestPayload(topic_partition[0], topic_partition[1],
                                    commit_offset, metadata)
            )

        if commits:
            logger.info('committing consumer offsets to group %s', self._config['group_id'])

            resps = []
            if self._config['offset_storage'] in ['zookeeper', 'dual']:
                resps += self._client.send_offset_commit_request(
                    self._config['group_id'], commits,
                    fail_on_error=False,
                )
            if self._config['offset_storage'] in ['kafka', 'dual']:
                resps += self._client.send_offset_commit_request_kafka(
                    self._config['group_id'], commits,
                    fail_on_error=False,
                )

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s', self._config['group_id'])
            return False

    #
    # Topic/partition management private methods
    #

    def _consume_topic_partition(self, topic, partition):
        topic = topic
        if not isinstance(partition, int):
            raise KafkaConfigurationError('Unknown partition type (%s) '
                                          '-- expected int' % type(partition))

        if topic not in self._client.topic_partitions:
            raise UnknownTopicOrPartitionError("Topic %s not found in broker metadata" % topic)
        if partition not in self._client.get_partition_ids_for_topic(topic):
            raise UnknownTopicOrPartitionError("Partition %d not found in Topic %s "
                                               "in broker metadata" % (partition, topic))
        logger.info("Configuring consumer to fetch topic '%s', partition %d", topic, partition)
        self._topics.append((topic, partition))

    def _refresh_metadata_on_error(self):
        refresh_ms = self._config['refresh_leader_backoff_ms']
        jitter_pct = 0.20
        sleep_ms = random.randint(
            int((1.0 - 0.5 * jitter_pct) * refresh_ms),
            int((1.0 + 0.5 * jitter_pct) * refresh_ms)
        )
        while True:
            logger.info("Sleeping for refresh_leader_backoff_ms: %d", sleep_ms)
            time.sleep(sleep_ms / 1000.0)
            try:
                self._client.load_metadata_for_topics()
            except KafkaUnavailableError:
                logger.warning("Unable to refresh topic metadata... cluster unavailable")
                self._check_consumer_timeout()
            else:
                logger.info("Topic metadata refreshed")
                return

    #
    # Offset-management private methods
    #

    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            resps = []
            if self._config['offset_storage'] in ('zookeeper', 'dual'):
                resps += self._client.send_offset_fetch_request(
                    self._config['group_id'],
                    [OffsetFetchRequestPayload(topic_partition[0], topic_partition[1])],
                    fail_on_error=False)
            if self._config['offset_storage'] in ('kafka', 'dual'):
                resps += self._client.send_offset_fetch_request_kafka(
                    self._config['group_id'],
                    [OffsetFetchRequestPayload(topic_partition[0], topic_partition[1])],
                    fail_on_error=False)
            try:
                for r in resps:
                    check_error(r)
            # API spec says server wont set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            max_offset = max(r.offset for r in resps)
            if max_offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = max_offset

    def _reset_highwater_offsets(self):
        for topic_partition in self._topics:
            self._offsets.highwater[topic_partition] = None

    def _reset_task_done_offsets(self):
        for topic_partition in self._topics:
            self._offsets.task_done[topic_partition] = None

    def _reset_partition_offset(self, topic_partition):
        (topic, partition) = topic_partition
        LATEST = -1
        EARLIEST = -2

        request_time_ms = None
        if self._config['auto_offset_reset'] == 'largest':
            request_time_ms = LATEST
        elif self._config['auto_offset_reset'] == 'smallest':
            request_time_ms = EARLIEST
        else:

            # Let's raise a reasonable exception type if user calls
            # outside of an exception context
            if sys.exc_info() == (None, None, None):
                raise OffsetOutOfRangeError('Cannot reset partition offsets without a '
                                            'valid auto_offset_reset setting '
                                            '(largest|smallest)')

            # Otherwise we should re-raise the upstream exception
            # b/c it typically includes additional data about
            # the request that triggered it, and we do not want to drop that
            raise # pylint: disable-msg=E0704

        (offset, ) = self.get_partition_offsets(topic, partition,
                                                request_time_ms, max_num_offsets=1)
        return offset

    #
    # Consumer Timeout private methods
    #

    def _set_consumer_timeout_start(self):
        self._consumer_timeout = False
        if self._config['consumer_timeout_ms'] >= 0:
            self._consumer_timeout = time.time() + (self._config['consumer_timeout_ms'] / 1000.0)

    def _check_consumer_timeout(self):
        if self._consumer_timeout and time.time() > self._consumer_timeout:
            raise ConsumerTimeout('Consumer timed out after %d ms' % self._config['consumer_timeout_ms'])

    #
    # Autocommit private methods
    #

    def _should_auto_commit(self):
        if self._does_auto_commit_ms():
            if time.time() >= self._next_commit_time:
                return True

        if self._does_auto_commit_messages():
            if self._uncommitted_message_count >= self._config['auto_commit_interval_messages']:
                return True

        return False

    def _reset_auto_commit(self):
        self._uncommitted_message_count = 0
        self._next_commit_time = None
        if self._does_auto_commit_ms():
            self._next_commit_time = time.time() + (self._config['auto_commit_interval_ms'] / 1000.0)

    def _incr_auto_commit_message_count(self, n=1):
        self._uncommitted_message_count += n

    def _does_auto_commit_ms(self):
        if not self._config['auto_commit_enable']:
            return False

        conf = self._config['auto_commit_interval_ms']
        if conf is not None and conf > 0:
            return True
        return False

    def _does_auto_commit_messages(self):
        if not self._config['auto_commit_enable']:
            return False

        conf = self._config['auto_commit_interval_messages']
        if conf is not None and conf > 0:
            return True
        return False

    #
    # Message iterator private methods
    #

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _get_message_iterator(self):
        # Fetch a new batch if needed
        if self._msg_iter is None:
            self._msg_iter = self.fetch_messages()

        return self._msg_iter

    def _reset_message_iterator(self):
        self._msg_iter = None

    #
    # python private methods
    #

    def __repr__(self):
        return '<{0} topics=({1})>'.format(
            self.__class__.__name__,
            '|'.join(["%s-%d" % topic_partition
                      for topic_partition in self._topics])
        )

    #
    # other private methods
    #

    def _deprecate_configs(self, **configs):
        for old, new in six.iteritems(DEPRECATED_CONFIG_KEYS):
            if old in configs:
                logger.warning('Deprecated Kafka Consumer configuration: %s. '
                               'Please use %s instead.', old, new)
                old_value = configs.pop(old)
                if new not in configs:
                    configs[new] = old_value
        return configs
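
A short usage sketch of this consumer class, assuming a local broker and an existing topic (topic and group names are illustrative; ConsumerTimeout is the exception documented on next(), and its import path depends on how this module is packaged):

consumer = KafkaConsumer('my-topic',
                         group_id='my-group',
                         bootstrap_servers=['localhost:9092'],
                         auto_commit_enable=False,
                         consumer_timeout_ms=5000)
try:
    for message in consumer:
        print(message.topic, message.partition, message.offset, message.value)
        consumer.task_done(message)   # mark the message as processed
except ConsumerTimeout:
    pass                              # no messages for 5 seconds
consumer.commit()                     # store task_done offsets for the group
consumer.close()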