class CrawlScheduler(object):
    def __init__(self):
        if False:
            # disabled code path: direct Kafka connection with SimpleConsumer
            self.kafka = KafkaClient(*KAFKA_SERVER)
            self.consumer = SimpleConsumer(self.kafka, "crawl", "wiki-links",
                                           driver_type=KAFKA_THREAD_DRIVER,
                                           auto_commit=False)
        else:
            # active code path: ZooKeeper-managed consumer
            self.kafka = None
            self.consumer = ZSimpleConsumer(ZKHOSTS, "crawl", "wiki-links",
                                            driver_type=KAFKA_THREAD_DRIVER,
                                            manage_offsets=True,
                                            auto_commit=False)
        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception, ex:
                logging.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
def test_produce_consume(self):
    # Send two messages and consume them
    message1 = KafkaClient.create_message("testing 1")
    message2 = KafkaClient.create_message("testing 2")
    req = ProduceRequest("test-produce-consume", 0, [message1, message2])
    self.kafka.send_message_set(req)
    self.assertTrue(self.server.wait_for("Created log for 'test-produce-consume'-0"))
    self.assertTrue(self.server.wait_for("Flushing log 'test-produce-consume-0'"))
    req = FetchRequest("test-produce-consume", 0, 0, 1024)
    (messages, req) = self.kafka.get_message_set(req)
    self.assertEquals(len(messages), 2)
    self.assertEquals(messages[0], message1)
    self.assertEquals(messages[1], message2)
    # Do the same, but for a different partition
    message3 = KafkaClient.create_message("testing 3")
    message4 = KafkaClient.create_message("testing 4")
    req = ProduceRequest("test-produce-consume", 1, [message3, message4])
    self.kafka.send_message_set(req)
    self.assertTrue(self.server.wait_for("Created log for 'test-produce-consume'-1"))
    self.assertTrue(self.server.wait_for("Flushing log 'test-produce-consume-1'"))
    req = FetchRequest("test-produce-consume", 1, 0, 1024)
    (messages, req) = self.kafka.get_message_set(req)
    self.assertEquals(len(messages), 2)
    self.assertEquals(messages[0], message3)
    self.assertEquals(messages[1], message4)
def test_check_offset(self):
    # Produce/consume a message, check that the next offset looks correct
    message1 = KafkaClient.create_message("testing 1")
    req = ProduceRequest("test-check-offset", 0, [message1])
    self.kafka.send_message_set(req)
    self.assertTrue(self.server.wait_for("Created log for 'test-check-offset'-0"))
    self.assertTrue(self.server.wait_for("Flushing log 'test-check-offset-0'"))
    req = FetchRequest("test-check-offset", 0, 0, 1024)
    (messages, nextReq) = self.kafka.get_message_set(req)
    self.assertEquals(len(messages), 1)
    self.assertEquals(messages[0], message1)
    self.assertEquals(nextReq.offset, len(KafkaClient.encode_message(message1)))
    # Produce another message, consume with the last offset
    message2 = KafkaClient.create_message("test 2")
    req = ProduceRequest("test-check-offset", 0, [message2])
    self.kafka.send_message_set(req)
    self.assertTrue(self.server.wait_for("Flushing log 'test-check-offset-0'"))
    # Verify
    (messages, nextReq) = self.kafka.get_message_set(nextReq)
    self.assertEquals(len(messages), 1)
    self.assertEquals(messages[0], message2)
    self.assertEquals(nextReq.offset,
                      len(KafkaClient.encode_message(message1)) +
                      len(KafkaClient.encode_message(message2)))
def output_kafka(graph_db, registry, kafka_url=None):
    ldict = {"step": MODULEFILE + "/" + inspect.stack()[0][3],
             "hostname": platform.node().split(".")[0]}
    l = logging.LoggerAdapter(common.fetch_lg(), ldict)
    kafka_topic = "cs"
    if kafka_url is None:
        kafka_url = registry.get_config("kafka_url", "localhost:9092")
    else:
        l.info("Updating registry with kafka_url: {}".format(kafka_url))
        registry.put_config("kafka_url", kafka_url)
    (nodes, rels) = out.output_json(graph_db, None, None, as_list=True)
    l.info("Connecting to kafka_url {}".format(kafka_url))
    kafka = KafkaClient(kafka_url)
    # To send messages asynchronously
    producer = SimpleProducer(kafka)
    l.info("Sending nodes to kafka {}/{}".format(kafka_url, kafka_topic))
    for n in nodes:
        producer.send_messages(kafka_topic, n)
    l.info("Sending rels to kafka {}/{}".format(kafka_url, kafka_topic))
    for n in rels:
        producer.send_messages(kafka_topic, n)
    kafka.close()
class CrawlScheduler(object):
    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        self.consumer = SimpleConsumer(self.kafka, KAFKA_CONSUMER_GROUP,
                                       KAFKA_TOPIC, auto_commit=True,
                                       max_buffer_size=1024 * 1024)
        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception, ex:
                logging.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
class KafkaBusClient(BasicBusClient):
    def __init__(self, client_type, config):
        super(KafkaBusClient, self).__init__(client_type, config)
        self.address = str(self.address)
        self.topic = str(self.topic)
        self.client = KafkaClient("%s:%d" % (self.address, self.port))
        if config.has_key('async'):
            self.async = config['async']
        else:
            self.async = True
        if self.client_type == 'producer':
            self.producer = SimpleProducer(self.client, async=self.async)
        else:
            self.consumer_group = str(self.consumer_group)
            if not config.has_key('consumer_procs'):
                self.consumer_procs = multiprocessing.cpu_count()
            #print "Using %d processes" % (self.consumer_procs)
            self.consumer = SimpleConsumer(self.client, self.consumer_group,
                                           self.topic)
                                           #num_procs=self.consumer_procs)

    def close(self):
        self.client.close()
def process(spouts, json_data):
    '''
    Returns a named tuple of type PartitionsSummary.
    '''
    results = []
    total_depth = 0
    total_delta = 0
    brokers = []
    for s in spouts:
        for p in s.partitions:
            try:
                print "process function: broker host:" + p['broker']['host']
                k = KafkaClient(p['broker']['host'], str(p['broker']['port']))
            except socket.gaierror, e:
                raise ProcessorError('Failed to contact Kafka broker %s (%s)' %
                                     (p['broker']['host'], str(e)))
            earliest_off = OffsetRequest(str(p['topic']), p['partition'], -2, 1)
            latest_off = OffsetRequest(str(p['topic']), p['partition'], -1, 1)
            earliest = k.send_offset_request([earliest_off])[0]
            latest = k.send_offset_request([latest_off])[0]
            current = p['offset']
            brokers.append(p['broker']['host'])
            total_depth = total_depth + (latest.offsets[0] - earliest.offsets[0])
            total_delta = total_delta + (latest.offsets[0] - current)
class KafkaConnector(object):
    def __init__(self, host_name, host_port):
        self.client = KafkaClient(host_name + ":" + host_port)
        self.producer = SimpleProducer(self.client)

    def create_topic(self, topic_name):
        topic_exists = self.client.has_metadata_for_topic(topic_name)
        if not topic_exists:
            self.client.ensure_topic_exists(topic_name)

    def send_message(self, topic_name, message):
        self.producer.send_messages(topic_name, message)

    def register_consumer(self, callback, parse_json, topic_group, topic_name):
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer_thread = ConsumerThread(consumer, callback, parse_json)
        consumer_thread.start()

    def blocking_consumer(self, message_consume_function, parse_json,
                          topic_group, topic_name):
        print "starting blocking consumer with topic group %s and topic name %s" \
            % (topic_group, topic_name)
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer.seek(0, 2)
        for message in consumer:
            message = parse_json(message)
            print "=============" + str(message) + "============"
            message_consume_function(message)
            print "called message consume function"
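# A minimal usage sketch for the KafkaConnector above; the broker address,
# topic, and group names are illustrative assumptions, not from the snippet.
def handle_message(message):
    print "consumed: %s" % (message,)

connector = KafkaConnector("localhost", "9092")
connector.create_topic("demo-topic")
connector.send_message("demo-topic", "hello world")
# register_consumer runs handle_message on a background ConsumerThread
connector.register_consumer(handle_message, json.loads, "demo-group", "demo-topic")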
def test_message_simple(self):
    msg = KafkaClient.create_message("testing")
    enc = KafkaClient.encode_message(msg)
    expect = "\x00\x00\x00\r\x01\x00\xe8\xf3Z\x06testing"
    self.assertEquals(enc, expect)
    (messages, read) = KafkaClient.read_message_set(enc)
    self.assertEquals(len(messages), 1)
    self.assertEquals(messages[0], msg)
def test_message_snappy(self):
    msg = KafkaClient.create_snappy_message("one", "two", "three")
    enc = KafkaClient.encode_message(msg)
    (messages, read) = KafkaClient.read_message_set(enc)
    self.assertEquals(len(messages), 3)
    self.assertEquals(messages[0].payload, "one")
    self.assertEquals(messages[1].payload, "two")
    self.assertEquals(messages[2].payload, "three")
def listen(self):
    client = KafkaClient(hosts(self.server_list, self.kafka_port))
    client.ensure_topic_exists(self.topic_name)
    # print client.topic_partitions()
    consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
    for message in consumer:
        value = message.message.value
        print value
def configure(self, **configs):
    """
    Configuration settings can be passed to constructor,
    otherwise defaults will be used:

        client_id='kafka.consumer.kafka',
        group_id=None,
        fetch_message_max_bytes=1024*1024,
        fetch_min_bytes=1,
        fetch_wait_max_ms=100,
        refresh_leader_backoff_ms=200,
        metadata_broker_list=None,
        socket_timeout_ms=30*1000,
        auto_offset_reset='largest',
        deserializer_class=lambda msg: msg,
        auto_commit_enable=False,
        auto_commit_interval_ms=60 * 1000,
        auto_commit_interval_messages=None,
        consumer_timeout_ms=-1

    Configuration parameters are described in more detail at
    http://kafka.apache.org/documentation.html#highlevelconsumerapi
    """
    self._config = {}
    for key in DEFAULT_CONSUMER_CONFIG:
        self._config[key] = configs.pop(key, DEFAULT_CONSUMER_CONFIG[key])

    if configs:
        raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                      str(list(configs.keys())))

    # Handle str/bytes conversions
    for config_key in BYTES_CONFIGURATION_KEYS:
        if isinstance(self._config[config_key], six.string_types):
            logger.warning("Converting configuration key '%s' to bytes" %
                           config_key)
            self._config[config_key] = self._config[config_key].encode('utf-8')

    if self._config['auto_commit_enable']:
        if not self._config['group_id']:
            raise KafkaConfigurationError(
                'KafkaConsumer configured to auto-commit without required '
                'consumer group (group_id)')

    # Check auto-commit configuration
    if self._config['auto_commit_enable']:
        logger.info("Configuring consumer to auto-commit offsets")
        self._reset_auto_commit()

    if self._config['metadata_broker_list'] is None:
        raise KafkaConfigurationError('metadata_broker_list required to '
                                      'configure KafkaConsumer')

    self._client = KafkaClient(self._config['metadata_broker_list'],
                               client_id=self._config['client_id'],
                               timeout=(self._config['socket_timeout_ms'] / 1000.0))
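# Hedged usage sketch for the configure() settings above, assuming the old
# kafka-python 0.9.x KafkaConsumer interface; the topic, group, broker list,
# and process() handler are illustrative assumptions.
consumer = KafkaConsumer('my-topic',
                         group_id='my-group',
                         metadata_broker_list=['localhost:9092'],
                         auto_commit_enable=True,
                         auto_commit_interval_ms=30 * 1000)
for record in consumer:
    # each record carries topic/partition/offset/key/value
    process(record.value)  # process() is a hypothetical handler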
def test_message_gzip(self):
    msg = KafkaClient.create_gzip_message("one", "two", "three")
    enc = KafkaClient.encode_message(msg)
    # Can't check the bytes directly since Gzip is non-deterministic
    (messages, read) = KafkaClient.read_message_set(enc)
    self.assertEquals(len(messages), 3)
    self.assertEquals(messages[0].payload, "one")
    self.assertEquals(messages[1].payload, "two")
    self.assertEquals(messages[2].payload, "three")
def test_kafka_queue():
    kafka = KafkaClient("kafka01", 9092)
    q = KafkaQueue(kafka, "queue", [0])
    q.put("first")
    q.put("second")
    assert q.get() == "first"
    assert q.get() == "second"
    q.close()
    kafka.close()
def test_message_simple_random(self):
    for i in xrange(ITERATIONS):
        n = random.randint(0, 10)
        msgs = [KafkaClient.create_message(random_string()) for j in range(n)]
        enc = KafkaClient.encode_message_set(msgs)
        (messages, read) = KafkaClient.read_message_set(enc)
        self.assertEquals(len(messages), n)
        for j in range(n):
            self.assertEquals(messages[j], msgs[j])
def create_topic_if_not_existing():
    client = KafkaClient(bootstrap_servers='localhost:9092')
    # force a metadata refresh so the topic list is current
    future = client.cluster.request_update()
    client.poll(future=future)
    metadata = client.cluster
    if TOPIC in metadata.topics():
        logger.info("Topic %s already exists", TOPIC)
    else:
        create_topic()
def test_message_gzip_random(self):
    for i in xrange(ITERATIONS):
        n = random.randint(0, 10)
        strings = [random_string() for j in range(n)]
        msg = KafkaClient.create_gzip_message(*strings)
        enc = KafkaClient.encode_message(msg)
        (messages, read) = KafkaClient.read_message_set(enc)
        self.assertEquals(len(messages), n)
        for j in range(n):
            self.assertEquals(messages[j].payload, strings[j])
def wait_for_kafka_topic(hostport, topic, timeout=60):
    """Wait for a Kafka topic to become available."""
    start = time.time()
    client = KafkaClient(hostport, client_id=b'dummy', timeout=1)
    while not client.has_metadata_for_topic(topic):
        if time.time() - start > timeout:
            raise Exception('timeout reached waiting for topic')
        time.sleep(0.1)
        client.load_metadata_for_topics()
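# Example invocation of the helper above; the host, topic, and timeout
# values are illustrative assumptions.
wait_for_kafka_topic('localhost:9092', b'demo-topic', timeout=30)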
def test_produce(self):
    # Produce a message, check that the log got created
    req = ProduceRequest("test-produce", 0, [KafkaClient.create_message("testing")])
    self.kafka.send_message_set(req)
    self.assertTrue(self.server.wait_for("Created log for 'test-produce'-0"))
    # Same thing, different partition
    req = ProduceRequest("test-produce", 1, [KafkaClient.create_message("testing")])
    self.kafka.send_message_set(req)
    self.assertTrue(self.server.wait_for("Created log for 'test-produce'-1"))
def _feed(settings_file, json_item):
    # strip the .py extension before importing the settings module
    settings = importlib.import_module(settings_file[:-3])
    kafka_conn = KafkaClient(settings.KAFKA_HOSTS)
    topic = settings.KAFKA_INCOMING_TOPIC
    producer = SimpleProducer(kafka_conn)
    print "=> feeding JSON request into {0}...".format(topic)
    print json.dumps(json_item, indent=4)
    kafka_conn.ensure_topic_exists(topic)
    producer.send_messages(topic, json.dumps(json_item))
    print "=> done feeding request."
def __init__(self, url, auto_commit=True, unique_key='_id'):
    """Connect to kafka instance
    """
    url_info = url.split(":")
    if len(url_info) < 2:
        raise SystemError
    self.server = KafkaClient(url_info[0], int(url_info[1]))
    self.producer_dict = {}
    self.auto_commit = auto_commit
def __init__(self, broker):
    try:
        self.client = KafkaClient(broker)
        self.prod = SimpleProducer(self.client)
    except KafkaUnavailableError:
        log.critical("\nCluster Unavailable %s : Check broker string\n", broker)
        raise
    except:
        raise
def __init__(self, topic, producer_type=ProducerType.SIMPLE,
             host_port="127.0.0.1:9092", **producer_opts):
    self.topic = topic
    self.host_port = host_port
    if producer_type == ProducerType.SIMPLE:
        self.producer = SimpleProducer(KafkaClient(host_port), **producer_opts)
    else:
        self.producer = KeyedProducer(KafkaClient(host_port), **producer_opts)
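# Usage sketch: the enclosing class of the __init__ above is not shown in the
# snippet, so MessageProducer is a hypothetical name; the topic and host_port
# values are illustrative assumptions.
mp = MessageProducer("activity.stream",
                     producer_type=ProducerType.SIMPLE,
                     host_port="127.0.0.1:9092")
mp.producer.send_messages(mp.topic, "payload")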
def test_create_gzip(self):
    msg = KafkaClient.create_gzip_message("testing")
    self.assertEquals(msg.magic, 1)
    self.assertEquals(msg.attributes, 1)
    # Can't check the crc or payload for gzip since it's non-deterministic
    (messages, _) = KafkaClient.read_message_set(gzip_decode(msg.payload))
    inner = messages[0]
    self.assertEquals(inner.magic, 1)
    self.assertEquals(inner.attributes, 0)
    self.assertEquals(inner.payload, "testing")
    self.assertEquals(inner.crc, -386704890)
def main():
    kafka = KafkaClient("localhost:9092")
    producer = SimpleProducer(kafka)
    consumer = SimpleConsumer(kafka, "my-group", "activity.stream",
                              max_buffer_size=None)
    producer.send_messages("activity.stream", "some message test")
    for message in consumer:
        print(message)
    kafka.close()
def checkTopicExists(topic_name):
    try:
        kafkaClient = KafkaClient(bootstrap_servers=HOSTPORT)
        metadata = kafkaClient.poll()
        server_topics = list(x[1] for x in metadata[0].topics)
        kafkaClient.close()
        return topic_name in server_topics
    except IndexError:
        return False
    except KafkaUnavailableError:
        logging.error("Kafka is not available")
def __init__(self, conn_pool, topic, group):
    self.conn_pool = conn_pool
    self.topic = topic
    self.group = group
    self.kafka = KafkaClient(self.conn_pool)
    self.kafka.ensure_topic_exists(self.topic)
    self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                   max_buffer_size=None)
    self.consumer.seek(0, 2)  # move to the tail of the queue
def test_create_snappy(self):
    msg = KafkaClient.create_snappy_message("testing")
    self.assertEquals(msg.magic, 1)
    self.assertEquals(msg.attributes, 2)
    self.assertEquals(msg.crc, -62350868)
    (messages, _) = KafkaClient.read_message_set(snappy_decode(msg.payload))
    inner = messages[0]
    self.assertEquals(inner.magic, 1)
    self.assertEquals(inner.attributes, 0)
    self.assertEquals(inner.payload, "testing")
    self.assertEquals(inner.crc, -386704890)
def connect(self, kafkaHost, countdown=COUNT_DOWN):
    if countdown == 0:
        logger.error('could not connect to kafka server after {} attempts'.format(COUNT_DOWN))
        return
    try:
        self.kafkaClient = KafkaClient(kafkaHost, timeout=self.SOCKET_TIMEOUT)
    except:
        logger.warning('trying to connect to kafka server again ({} attempts left)'.format(countdown))
        self.connect(kafkaHost, countdown - 1)
        return  # avoid logging success after a failed attempt
    logger.info('Kafka client connected {}'.format(self.kafkaClient))
def listen(self):
    client = KafkaClient(hosts(self.server_list, self.kafka_port))
    client.ensure_topic_exists(self.topic_name)
    consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
    for message in consumer:
        value = message.message.value
        value = json.loads(value)
        if value['no'] % 10 == 0:
            print value
            subject = "test mail => " + message.message.value
            body = "Good day! Now is " + datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            send_mail(self.email_address, subject, body)
def add_data():
    global users
    try:
        mcl = pm.MongoClient('10.137.168.196:27017')
        kafka = KafkaClient('mozo.cloudapp.net:9092', timeout=None)
        producer = UserProducer(kafka, kafkaTopic, users, parts, async=False,
                                req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                                ack_timeout=200)
        coll = mcl.DataSet['PMLExpression']
        ii = 0  # max is 151413 (number of doc in PMLExpression)
        for ent in coll.find({'userId': {'$in': UoI.keys()}},
                             {'_id': True, 'userId': True}, timeout=False):
            ii += 1
            entity = str(ent['_id'])
            userId = ent['userId']
            if stop_add_data(userId):
                continue
            UoI[userId] += 1
            encodedMessage = simplejson.dumps({'turtleId': turtleId,
                                               'userId': userId,
                                               'entity': entity,
                                               'operation': 'add_data'})
            print producer.send(userId, encodedMessage)
        for userId, partitionId in users.iteritems():
            encodedMessage = simplejson.dumps({'turtleId': turtleId,
                                               'userId': userId,
                                               'operation': 'save_one'})
            print producer.send(userId, encodedMessage)
        userColl = mcl.DataSet['PMLUsers']
        if users:
            userColl.insert([{'userId': userId, 'partitionId': partitionId}
                             for userId, partitionId in users.iteritems()])
    finally:
        producer.stop()
        mcl.close()
        kafka.close()
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            self.settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            dict(
                crawlid = input['crawlid'],
                appid = input['appid'],
                url = url,
                status_code = response.getcode(),
                status_msg = 'Success',
                timestamp = response.info()['date'],
                links_found = links,
                body = html,
                attrs = input['attrs']
            )
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        crawled = json.loads(message)
        if crawled['appid'] == self.settings["appid"]:
            safeurl = crawled['url'].encode('utf-8', 'ignore')
            self.log("Lookahead spout received id: " + crawled['crawlid'] +
                     " url: " + safeurl)
            context = {'source': 'datawake-lookahead',
                       'domain': crawled['attrs']['domain']}
            self.emit([crawled['url'], crawled['status_code'], '', '',
                       crawled['body'], crawled['timestamp'],
                       context['source'], context])
class Producer():
    def __init__(self, server_list, kafka_port, topic_name):
        self.server_list = server_list
        self.kafka_port = kafka_port
        self.topic_name = topic_name
        self.client = KafkaClient(hosts(self.server_list, self.kafka_port))
        self.producer = SimpleProducer(self.client, batch_send=False)

    def ensure_topic_exists(self):
        self.client.ensure_topic_exists(self.topic_name)

    def forwarder(self, message):
        self.producer.send_messages(self.topic_name, message)
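# Usage sketch for the Producer wrapper above; the server names, port, and
# topic are illustrative assumptions, and hosts() is the same helper the
# class itself relies on.
producer = Producer(["kafka01", "kafka02"], 9092, "forwarded-events")
producer.ensure_topic_exists()
producer.forwarder("an opaque message body")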
def initialize(self, stormconf, context):
    try:
        settings = all_settings.get_settings(stormconf['topology.deployment'])
        self.topic = settings['crawler-in-topic'].encode()
        self.conn_pool = settings['conn_pool'].encode()
        self.log('CrawlerSpout initialized with topic =' + self.topic +
                 ' conn_pool=' + self.conn_pool)
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                       max_buffer_size=None,
                                       fetch_size_bytes=2000000)
        self.consumer.seek(0, 2)  # move to the tail of the queue
    except:
        self.log("CrawlerSpout initialize error", level='error')
        self.log(traceback.format_exc(), level='error')
        raise
def __init__(self, api, kafka_host='localhost:9092', stream_config={}):
    super(tweepy.StreamListener, self).__init__()
    self.api = api
    self.stream_config = stream_config
    print('bootstrap_servers:', kafka_host)
    self.producer = KafkaProducer(bootstrap_servers=kafka_host)
    # Add Kafka topics
    topic = self.stream_config.get('kafka_topic')
    if topic:
        client = KafkaClient(bootstrap_servers=kafka_host)
        client.add_topic(topic)
def send_msg(msgs):
    cli = KafkaClient("localhost:9092")
    producer = SimpleProducer(cli)
    if isinstance(msgs, list):
        content = [(json.dumps(msg) if isinstance(msg, dict) else msg)
                   for msg in msgs]
    else:
        content = [msgs]
    try:
        resp = producer.send_messages("tp_test1", *content)
        print resp
    except Exception:
        print traceback.format_exc()
    finally:
        cli.close()
def configure_input_queue(self):
    """
    configures the input queue that other services can use to schedule
    an event to be delivered
    """
    client = KafkaClient(hosts=self.kafka_hosts)
    client.ensure_topic_exists(self.input_topic)
    indexed_consumer = IndexedConsumer(self.input_topic, self.kafka_hosts)
    queue_consumer = KafkaConsumer(self.input_topic,
                                   bootstrap_servers=self.kafka_hosts,
                                   group_id=CONSUMER_GROUP)
    queue_producer = SimpleProducer(KafkaClient(hosts=self.kafka_hosts))
    self.queues.append(InputQueue(queue_consumer, indexed_consumer,
                                  queue_producer, self.number_of_queues))
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input: (timestamp, org, domain, user_id, url, html)
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                # unwrap the OffsetAndMessage before splitting the
                # NUL-delimited record (the original split the wrapper object)
                value = message.message.value
                (timestamp, org, domain, userId, url, html) = value.split('\0')
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([url, '', '', '', html, timestamp,
                           context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
def offsetCommit():
    global users
    checkUserPartitionMapping()
    kafkaClient = KafkaClient(kafkaHost, timeout=None)
    producer = KeyedProducer(kafkaClient, async=False,
                             req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                             ack_timeout=200)
    for partition in partitions:
        encodedMessage = simplejson.dumps({'turtleName': turtleName,
                                           'user': '',
                                           'operation': 'offsetCommit'})
        print producer.send(kafkaTopic, partition, encodedMessage)
    producer.stop(1)
    kafkaClient.close()
def mockTradingdesk(sleeptime=0.1):
    ISOTIMEFORMAT = '%Y-%m-%d %X'
    global count
    count = 1
    while True:
        print count
        Client = KafkaClient("172.20.0.51:9092")
        data = json.dumps({
            "time_stamp": time.strftime(ISOTIMEFORMAT, time.localtime()),
            "click_id": "yf_td_test_topic", "campaign_id": "3", "offer_id": "4",
            "ref_site": "5", "site": "6", "click_time": "7", "cost_per_click": "8",
            "payout": "9", "real_ip": "10", "proxy_ip": "11", "device_id": "12",
            "os_id": "13", "carrier_id": "14", "mobile_brand_id": "15",
            "screen_h": "16", "screen_w": "17", "screen_id": "18", "city_id": "19",
            "brand_id": "20", "model_id": "21", "country_id": "22", "state_id": "23",
            "conversion_time": "24", "event": "25", "sub1": "26", "sub2": "27",
            "sub3": "28", "sub4": "29", "sub5": "30", "sub6": "31", "sub7": "32",
            "sub8": "33", "click": "34", "lp_click": "35", "conversion": "36",
            "sub_campaign_id": "37"})
        producer = SimpleProducer(Client, async=False,
                                  req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE)
        producer.send_messages("td_test_topic1", data)
        count += 1
        Client.close()
        time.sleep(sleeptime)  # was a hard-coded 0.1 that ignored the parameter
    thread.exit_thread()
def test_message_list(self):
    msgs = [
        KafkaClient.create_message("one"),
        KafkaClient.create_message("two"),
        KafkaClient.create_message("three")
    ]
    enc = KafkaClient.encode_message_set(msgs)
    expect = ("\x00\x00\x00\t\x01\x00zl\x86\xf1one\x00\x00\x00\t\x01\x00\x11"
              "\xca\x8aftwo\x00\x00\x00\x0b\x01\x00F\xc5\xd8\xf5three")
    self.assertEquals(enc, expect)
    (messages, read) = KafkaClient.read_message_set(enc)
    self.assertEquals(len(messages), 3)
    self.assertEquals(messages[0].payload, "one")
    self.assertEquals(messages[1].payload, "two")
    self.assertEquals(messages[2].payload, "three")
class KafkaClientPlugin(cherrypy.process.plugins.SimplePlugin):
    def start(self):
        self.client = KafkaClient(config.KAFKA_HOST)
        self.producer = SimpleProducer(self.client)
        self.writer = WriterProcess(config.KAFKA_HOST)
        self.writer.start()
        self.bus.subscribe("dbwrite", self.dbwrite)

    def stop(self):
        self.writer.terminate()
        self.client.close()

    def dbwrite(self, key, value):
        message = IWAMessage(key, value)
        self.producer.send_messages(config.KAFKA_TOPIC, message.dumps())
        cherrypy.log("Queued: %s => %s" % (message.key, message.value))
def run(self):
    self.barrier.wait()
    log.info("Starting %s" % self)
    messages = []
    last_produce = time.time()

    def flush(messages):
        self.client.send_message_set(ProduceRequest(self.topic, -1, messages))
        del messages[:]

    while True:
        if self.barrier.is_set() is False:
            log.info("Shutdown %s, flushing messages" % self)
            flush(messages)
            self.client.close()
            break
        if len(messages) > self.producer_flush_buffer:
            log.debug("Message count threshold reached. Flushing messages")
            flush(messages)
            last_produce = time.time()
        elif (time.time() - last_produce) > self.producer_flush_timeout:
            log.debug("Producer timeout reached. Flushing messages")
            flush(messages)
            last_produce = time.time()
        try:
            msg = KafkaClient.create_message(
                self.in_queue.get(True, self.producer_timeout))
            messages.append(msg)
        except Empty:
            continue
def __init__(self, settings):
    # dynamic import of settings file
    # remove the .py from the filename
    self.settings = importlib.import_module(settings[:-3])
    # only need kafka for both uses
    self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)
class Kafka(PluginBase):
    def __init__(self):
        self.pluginName = "KafkaProducer"
        super(Kafka, self).__init__()
        # TODO: move kafka client config to config.ini
        #print(dir(kafkaproducer))
        self.myKafka = KafkaClient("192.168.100.91", 9092)
        #self.producer = SimpleProducer(self.myKafka, "netflow", async=True)

    def run(self, inputObject):
        r = self._fmt(inputObject)
        self.myKafka.send_messages_simple("netflow", r)

    def _fmt(self, inputObject):
        r = {key: getattr(inputObject, key)
             for key in Settings.SETTINGS.getlist(
                 Settings.SETTINGS.get("output", "fieldNames"))}
        self.logger.debug("Sending: %s" % (json.dumps(r)))
        return json.dumps(r)
def test_10k_messages(self):
    msg_tmpl = "this is a test message with a few bytes in it. this is message number %d"
    # TODO 10k actually fails, why?
    msg = KafkaClient.create_gzip_message(*[msg_tmpl % i for i in range(1000)])
    req = ProduceRequest("test-10k", 0, [msg])
    self.kafka.send_message_set(req)
    self.assertTrue(self.server.wait_for("Created log for 'test-10k'-0"))
    self.assertTrue(self.server.wait_for("Flushing log 'test-10k-0'"))
class KafkaConsumer:
    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                       max_buffer_size=None)
        self.consumer.seek(0, 2)  # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
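# Usage sketch for the consumer wrapper above; the connection string and
# topic are illustrative assumptions. next() blocks until a message arrives.
consumer = KafkaConsumer("kafka01:9092", "crawled-pages", KafkaConsumer.group)
first_message = consumer.next()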
def train(numIters):
    global users
    checkUserPartitionMapping()
    kafka = KafkaClient(kafkaHost, timeout=None)
    producer = UserProducer(kafka, kafkaTopic, users, partitions, async=False,
                            req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                            ack_timeout=200)
    for i in range(numIters):
        for user, partitionId in users.iteritems():
            if user == '' or user == 'monk':
                continue
            encodedMessage = simplejson.dumps({'turtleName': turtleName,
                                               'user': user,
                                               'operation': 'train'})
            print i, producer.send(user, encodedMessage)
    producer.stop(1)
    kafka.close()