Example #1
0
# Publish three numbered test messages to the 'test' topic, then close the
# producer.
# NOTE(review): `producer` is created outside this snippet -- confirm its type
# and serializer configuration against the full example.
for index in range(3):
    payload = "msg %d" % index
    print(payload)
    producer.send('test', payload)
producer.close()

# Produce data
from pykafka import KafkaClient

# Comma-separated broker list; each "IP" is a placeholder for a real host.
host = 'IP:9092, IP:9092, IP:9092'
client = KafkaClient(hosts=host)
# Producer
topicdocu = client.topics['my-topic']
producer = topicdocu.get_producer()
try:
    for i in range(100):
        print(i)
        # pykafka only accepts bytes payloads, so the text is encoded before
        # it is handed to the producer.
        producer.produce(('test message ' + str(i**2)).encode('utf-8'))
finally:
    # Stop the producer even when producing fails part-way through.
    producer.stop()

# Print every topic known to the local Kafka broker
from pykafka import KafkaClient
client = KafkaClient(hosts="127.0.0.1:9092")
for topic_name in client.topics:
    print(topic_name)

# View broker information
from pykafka import KafkaClient
# The broker list is passed through the `hosts` keyword (as in the other
# snippets); `host` is not a KafkaClient argument.
client = KafkaClient(hosts="127.0.0.1:9092")
print(client.brokers)

# NOTE(review): this loop only binds `host`; the snippet appears to be cut off
# here -- confirm against the full example what should happen with the value.
for n in client.brokers:
    host = client.brokers[n].host
Example #2
0
            # Raw events are sent as UTF-8 text; everything else is
            # serialized as JSON with sorted keys.
            message_value = event.encode('utf-8') if raw else \
                json.dumps(event, sort_keys=True).encode('utf-8')

            # When an event is passed to "produce" it will end up in a local
            # buffer (controlled by librdkafka) first, and then later on it
            # will be delivered to Kafka.
            # It might happen that the buffer is full, so the following logic
            # is needed to implement a simple retry logic before giving up.
            event_enqueued = False
            enqueue_retries = 0
            while (not event_enqueued
                   and enqueue_retries < KAFKA_CONFLUENT_RETRY_BUFFER_FULL):
                try:
                    # Produce the message. The attempt is counted before the
                    # call so a failed attempt still uses up one retry.
                    enqueue_retries += 1
                    kafka_producer.produce(message_topic, message_value,
                                           message_key)
                    event_enqueued = True
                except BufferError:
                    if enqueue_retries < KAFKA_CONFLUENT_RETRY_BUFFER_FULL:
                        logging.warning(
                            'Local produce queue full, waiting for '
                            'events delivered.')
                        # Poll the producer for up to 0.5s before retrying.
                        kafka_producer.poll(0.5)
                    else:
                        # The count is passed as a logging argument so "%d"
                        # is actually interpolated (str.format ignores it).
                        logging.error("Failed to enqueue an event to the "
                                      "local kafka producer queue after %d "
                                      "retries.", enqueue_retries)
                        # Bare raise re-raises the active BufferError with
                        # its original traceback.
                        raise

            # If not async, flush the Kafka produce buffer now and block
            # until we are done.
Example #3
0
def run(argv):
    """Run the Kafka publisher used for rate-controlled publishing tests.

    Parses the test parameters from ``argv``, creates a producer, starts the
    ``HDKafkapRepSrv`` reply server and then loops forever: it waits until
    ``run_data['start']`` becomes truthy, publishes numbered messages for
    ``hd.test_duration`` seconds while throttling towards
    ``hd.msg_requested_rate``, and stores the results in
    ``run_data['stats']``.

    Args:
        argv: Command-line style sequence laid out as ``[program,
            test_duration, msg_batch, msg_requested_rate, topic_name, acks,
            linger_ms]``.

    Raises:
        ValueError: If fewer than six parameters follow the program name, or
            if a numeric parameter cannot be converted.
    """
    old_client = False

    # All six parameters are used below, so refuse to start without them
    # instead of failing later on an unbound name or a missing index.
    if len(argv) < 7:
        raise ValueError(
            "expected 6 arguments: test_duration msg_batch msg_requested_rate "
            "topic_name acks linger_ms (got %d)" % max(len(argv) - 1, 0))
    msg_batch = int(argv[2])
    msg_requested_rate = float(argv[3])
    test_duration = float(argv[1])
    topic_name = str(argv[4])
    acks = int(argv[5])
    linger_ms = int(argv[6])

    # Initialize Kafka PUB Server
    l.info("Starting Kafka Publisher (producer)")
    # Estimate average message size to compute batch_size in [bytes] / Requested by Kafka
    min_message_size = len(str(0) + ' msg' + str(0))
    max_message_size = len(str(msg_requested_rate) + ' msg' + str(msg_requested_rate))
    average_message_size = (min_message_size + max_message_size) / 2
    # The producers expect an integral size/count, so drop the fraction that
    # true division introduces.
    batch_estimated_size = int(average_message_size * msg_batch)
    l.info("Message Average Size is: [%s]. Kafka Batch Size in Bytes set to: [%s]" % (average_message_size,
                                                                                      batch_estimated_size))
    if old_client:
        producer = KafkaProducer(bootstrap_servers=['localhost:9092'], batch_size=batch_estimated_size,
                                 linger_ms=linger_ms, acks=acks)
    else:
        client = KafkaClient(hosts='localhost:9092')
        topic = client.topics[topic_name]
        # NOTE(review): min_queued_messages receives the byte estimate computed
        # above -- confirm that a message count is not what was intended.
        producer = topic.get_producer(min_queued_messages=batch_estimated_size, linger_ms=linger_ms, required_acks=acks)

    # Initialize simple Rep server, this is used to listen
    # for the signal to start sending data
    pub_rep_port = os.environ.get('PORT0')
    l.info("STARTING KAFKA REP server at port [%s].", pub_rep_port)
    run_data = {'start': False,
                'stats': {'rate': 0, 'msg_cnt': 0},
                'test_status': 'stopped'}
    pub_metrics = {'test_duration': test_duration,
                   'msg_batch': msg_batch,
                   'msg_requested_rate': msg_requested_rate}
    hd = HDKafkapRepSrv(pub_rep_port, run_data, pub_metrics)
    # NOTE(review): presumably returns immediately and flips
    # run_data['start'] from another thread -- confirm against HDKafkapRepSrv.
    hd.run()

    # Defined up front so the shutdown log below works even when the loop is
    # interrupted before the first test has run.
    msg_cnt = 0
    elapsed_time = 0.0
    try:
        while True:
            #  Wait for 'signal' to start sending messages to Kafka Broker
            if not run_data['start']:
                l.debug("KAFKA PUB WAITING FOR SIGNAL...")
                time.sleep(1)
                continue
            l.info('PUB server initiating... Test Duration [%f] secs. Messages with batches [%d]'
                   'and requested msg rate [%f]' % (hd.test_duration, hd.msg_batch, hd.msg_requested_rate))
            cnt = 0
            msg_cnt = 0
            start_time = time.time()

            # Start Publishing Messages to Broker
            while True:
                # Build 'message'; both clients require bytes payloads, so the
                # text is encoded before it is handed over.
                messagedata = "msg%d" % msg_cnt
                message = ("%d %s" % (msg_cnt, messagedata)).encode('utf-8')

                try:
                    # Publish message to the Kafka Cluster
                    # topic: specifies the 'topic' where the message will be published
                    if old_client:
                        producer.send(topic=topic_name, value=message)
                    else:
                        producer.produce(message)
                except KafkaTimeoutError as e:
                    # Format the exception itself: Python 3 exceptions have no
                    # `.message` attribute.
                    l.error("Unable to publish message to the Kafka Cluster. ERROR: %s", e)

                # Insert a 'delay' if tx rate between batches outperforms the expected
                # (minimum) rate to achieve requested tx rate
                cnt += 1
                msg_cnt += 1
                if cnt >= hd.msg_batch:
                    # Sleep for the time we are ahead of schedule, capped at
                    # one second and never negative.
                    duration = time.time() - start_time
                    expected_time = msg_cnt / hd.msg_requested_rate
                    time.sleep(min(max(expected_time - duration, 0.0), 1))
                    cnt = 0
                elapsed_time = time.time() - start_time
                if elapsed_time >= hd.test_duration:
                    break
            # Update 'stats' to 'hd' (HDaemon)
            run_data['stats']['time:end'] = json.dumps(time.time())
            # Guard against a zero-length run (e.g. a test duration of 0).
            run_data['stats']['rate'] = msg_cnt / elapsed_time if elapsed_time > 0 else 0.0
            run_data['stats']['msg_cnt'] = msg_cnt
            process = psutil.Process()
            run_data['stats']['net:end'] = json.dumps(psutil.net_io_counters())
            run_data['stats']['cpu:end'] = json.dumps(process.cpu_times())
            run_data['stats']['mem:end'] = json.dumps(process.memory_info())
            run_data['test_status'] = 'stopping'
            # Go back to waiting for the next test
            run_data['start'] = False
    finally:
        # The loop above only ends through an exception (e.g. Ctrl-C), so the
        # shutdown has to live in a finally block to be reachable at all.
        # kafka-python producers are closed, pykafka producers are stopped.
        if old_client:
            producer.close()
        else:
            producer.stop()
        l.info("PUB Server stopping after sending %d messages elapsed time %f and message rate %f" %
               (msg_cnt, elapsed_time, run_data['stats']['rate']))
Example #4
0
        # Build the message key by filling the `key` format template with the
        # event's fields; this only happens for non-raw events when a key
        # template was supplied.
        # NOTE(review): when this branch is skipped, `message_key` keeps
        # whatever value was assigned outside this snippet -- confirm.
        if not raw and key:
            try:
                message_key = key.format(**event)
            # If we failed to build the key, log the error and skip the event.
            except KeyError as e:
                logging.error(
                    'Could not get message key from event. KeyError: %s. '
                    'Skipping event.' % e
                )
                continue

        # Raw events are sent as UTF-8 text; everything else is serialized as
        # JSON with sorted keys.
        message_value = event.encode('utf-8') if raw else \
            json.dumps(event, sort_keys=True).encode('utf-8')

        # Produce the message.
        kafka_producer.produce(message_topic, message_value, message_key)

        # If not async, flush the Kafka produce buffer now and block
        # until we are done.
        # NOTE(review): `async` is a reserved keyword from Python 3.7 onward,
        # so this name only parses on older interpreters; it is defined
        # outside this snippet and cannot be renamed from here.
        if not async:
            kafka_producer.flush()


@writes('mysql', 'sqlite')
def sql_writer(
    uri,
    replace=False,
    statsd_host='',
    batch_size=3000,
    batch_time=300
):
Example #5
0
import json
import random

from kafka import KafkaProducer

# kafka-python publishes through send() and only accepts bytes values unless a
# value_serializer is configured, so the records are serialized as UTF-8 JSON.
etl_producer = KafkaProducer(
    value_serializer=lambda value: json.dumps(value).encode('utf-8'))


def create_features(data):
    """Derive one feature record per input record and publish it.

    For every record in ``data`` a single-element list holding the record's
    ``id``, ``feat_y`` (``field_y`` plus a random integer from 0 to 3) and
    ``feat_z`` (a random choice of 'D', 'E' or 'F') is sent to the
    'features' topic.

    Args:
        data: Iterable of dicts providing at least the keys ``id`` and
            ``field_y``.

    Raises:
        KeyError: If a record lacks ``id`` or ``field_y``.
    """
    for rec in data:
        feat_y = rec['field_y'] + random.randint(0, 3)
        feat_z = random.choice(['D', 'E', 'F'])
        etl_producer.send('features', [{
            'id': rec['id'],
            'feat_y': feat_y,
            'feat_z': feat_z
        }])


if __name__ == '__main__':
    data = [{
        'id': random.randint(500, 600),
        'field_x': random.choice(['A', 'B', 'C']),
        'field_y': random.choice([1, 2, 3])
    } for _ in range(10)]

    etl_producer.send('raw_fields', data)
    create_features(data)
    # send() is asynchronous; flush so buffered records are delivered before
    # the script exits.
    etl_producer.flush()
Example #6
0
def run(argv):
    """Run the Kafka publisher used for rate-controlled publishing tests.

    Parses the test parameters from ``argv``, creates a producer, starts the
    ``HDKafkapRepSrv`` reply server and then loops forever: it waits until
    ``run_data['start']`` becomes truthy, publishes numbered messages for
    ``hd.test_duration`` seconds while throttling towards
    ``hd.msg_requested_rate``, and stores the results in
    ``run_data['stats']``.

    Args:
        argv: Command-line style sequence laid out as ``[program,
            test_duration, msg_batch, msg_requested_rate, topic_name, acks,
            linger_ms]``.

    Raises:
        ValueError: If fewer than six parameters follow the program name, or
            if a numeric parameter cannot be converted.
    """
    old_client = False

    # All six parameters are used below, so refuse to start without them
    # instead of failing later on an unbound name or a missing index.
    if len(argv) < 7:
        raise ValueError(
            "expected 6 arguments: test_duration msg_batch msg_requested_rate "
            "topic_name acks linger_ms (got %d)" % max(len(argv) - 1, 0))
    msg_batch = int(argv[2])
    msg_requested_rate = float(argv[3])
    test_duration = float(argv[1])
    topic_name = str(argv[4])
    acks = int(argv[5])
    linger_ms = int(argv[6])

    # Initialize Kafka PUB Server
    l.info("Starting Kafka Publisher (producer)")
    # Estimate average message size to compute batch_size in [bytes] / Requested by Kafka
    min_message_size = len(str(0) + ' msg' + str(0))
    max_message_size = len(
        str(msg_requested_rate) + ' msg' + str(msg_requested_rate))
    average_message_size = (min_message_size + max_message_size) / 2
    # The producers expect an integral size/count, so drop the fraction that
    # true division introduces.
    batch_estimated_size = int(average_message_size * msg_batch)
    l.info(
        "Message Average Size is: [%s]. Kafka Batch Size in Bytes set to: [%s]"
        % (average_message_size, batch_estimated_size))
    if old_client:
        producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                 batch_size=batch_estimated_size,
                                 linger_ms=linger_ms,
                                 acks=acks)
    else:
        client = KafkaClient(hosts='localhost:9092')
        topic = client.topics[topic_name]
        # NOTE(review): min_queued_messages receives the byte estimate computed
        # above -- confirm that a message count is not what was intended.
        producer = topic.get_producer(min_queued_messages=batch_estimated_size,
                                      linger_ms=linger_ms,
                                      required_acks=acks)

    # Initialize simple Rep server, this is used to listen
    # for the signal to start sending data
    pub_rep_port = os.environ.get('PORT0')
    l.info("STARTING KAFKA REP server at port [%s].", pub_rep_port)
    run_data = {
        'start': False,
        'stats': {
            'rate': 0,
            'msg_cnt': 0
        },
        'test_status': 'stopped'
    }
    pub_metrics = {
        'test_duration': test_duration,
        'msg_batch': msg_batch,
        'msg_requested_rate': msg_requested_rate
    }
    hd = HDKafkapRepSrv(pub_rep_port, run_data, pub_metrics)
    # NOTE(review): presumably returns immediately and flips
    # run_data['start'] from another thread -- confirm against HDKafkapRepSrv.
    hd.run()

    # Defined up front so the shutdown log below works even when the loop is
    # interrupted before the first test has run.
    msg_cnt = 0
    elapsed_time = 0.0
    try:
        while True:
            #  Wait for 'signal' to start sending messages to Kafka Broker
            if not run_data['start']:
                l.debug("KAFKA PUB WAITING FOR SIGNAL...")
                time.sleep(1)
                continue
            l.info(
                'PUB server initiating... Test Duration [%f] secs. Messages with batches [%d]'
                'and requested msg rate [%f]' %
                (hd.test_duration, hd.msg_batch, hd.msg_requested_rate))
            cnt = 0
            msg_cnt = 0
            start_time = time.time()

            # Start Publishing Messages to Broker
            while True:
                # Build 'message'; both clients require bytes payloads, so the
                # text is encoded before it is handed over.
                messagedata = "msg%d" % msg_cnt
                message = ("%d %s" % (msg_cnt, messagedata)).encode('utf-8')

                try:
                    # Publish message to the Kafka Cluster
                    # topic: specifies the 'topic' where the message will be published
                    if old_client:
                        producer.send(topic=topic_name, value=message)
                    else:
                        producer.produce(message)
                except KafkaTimeoutError as e:
                    # Format the exception itself: Python 3 exceptions have no
                    # `.message` attribute.
                    l.error(
                        "Unable to publish message to the Kafka Cluster. ERROR: %s",
                        e)

                # Insert a 'delay' if tx rate between batches outperforms the expected
                # (minimum) rate to achieve requested tx rate
                cnt += 1
                msg_cnt += 1
                if cnt >= hd.msg_batch:
                    # Sleep for the time we are ahead of schedule, capped at
                    # one second and never negative.
                    duration = time.time() - start_time
                    expected_time = msg_cnt / hd.msg_requested_rate
                    time.sleep(min(max(expected_time - duration, 0.0), 1))
                    cnt = 0
                elapsed_time = time.time() - start_time
                if elapsed_time >= hd.test_duration:
                    break
            # Update 'stats' to 'hd' (HDaemon)
            run_data['stats']['time:end'] = json.dumps(time.time())
            # Guard against a zero-length run (e.g. a test duration of 0).
            run_data['stats']['rate'] = (
                msg_cnt / elapsed_time if elapsed_time > 0 else 0.0)
            run_data['stats']['msg_cnt'] = msg_cnt
            process = psutil.Process()
            run_data['stats']['net:end'] = json.dumps(psutil.net_io_counters())
            run_data['stats']['cpu:end'] = json.dumps(process.cpu_times())
            run_data['stats']['mem:end'] = json.dumps(process.memory_info())
            run_data['test_status'] = 'stopping'
            # Go back to waiting for the next test
            run_data['start'] = False
    finally:
        # The loop above only ends through an exception (e.g. Ctrl-C), so the
        # shutdown has to live in a finally block to be reachable at all.
        # kafka-python producers are closed, pykafka producers are stopped.
        if old_client:
            producer.close()
        else:
            producer.stop()
        l.info(
            "PUB Server stopping after sending %d messages elapsed time %f and message rate %f"
            % (msg_cnt, elapsed_time, run_data['stats']['rate']))