""" Usage: k-topic.py <host> [new-topic] """ from kafka import SimpleProducer, KafkaClient import logging import sys logging.basicConfig() kafka = KafkaClient(sys.argv[1] + ':9092') if len(sys.argv) > 2: topic = sys.argv[2] print("creating topic: {0}".format(topic)) kafka.ensure_topic_exists(topic) for t in kafka.topics: print("{0!r}:".format(t)) # print(" partitions: {0!r}:".format(kafka.get_partition_ids_for_topic(t))) print("done.") ## end
def main():
    producer = KafkaProducer(bootstrap_servers=ipAddress + ':9092',
                             value_serializer=lambda v: json.dumps(v).encode('utf-8'))
    kafka = KafkaClient(ipAddress + ':9092')
    kafka.ensure_topic_exists(knpTenant)

    ############ VIALIS ##############
    readsendTenant(producer, 'vialis')
def send_payload(self, endpoint, topic, message, ensure_topic_exists=True):
    kafka = KafkaClient(endpoint)
    if ensure_topic_exists:
        kafka.ensure_topic_exists(topic)
    producer = SimpleProducer(kafka, async=True)
    producer.send_messages(topic, message)
def run(self, topic, message, hosts=None):
    """
    Simple round-robin synchronous producer to send one message to one topic.

    :param hosts: Kafka hostname(s) to connect in host:port format.
                  Comma-separated for several hosts.
    :type hosts: ``str``
    :param topic: Kafka Topic to publish the message on.
    :type topic: ``str``
    :param message: The message to publish.
    :type message: ``str``

    :returns: Response data: `topic`, target `partition` where message was sent,
              `offset` number and `error` code (hopefully 0).
    :rtype: ``dict``
    """
    if hosts:
        _hosts = hosts
    elif self.config.get('hosts', None):
        _hosts = self.config['hosts']
    else:
        raise ValueError("Need to define 'hosts' in either action or in config")

    # set default for empty value
    _client_id = self.config.get('client_id') or self.DEFAULT_CLIENT_ID

    client = KafkaClient(_hosts, client_id=_client_id)
    client.ensure_topic_exists(topic)
    producer = SimpleProducer(client)
    result = producer.send_messages(topic, kafka_bytestring(message))

    if result[0]:
        return result[0].__dict__
def ensure_topic_existed(topic):
    try:
        server_info = get_server_kafka()
        client = KafkaClient(server_info)
        client.ensure_topic_exists(topic)
        client.close()
    except ValueError as error:
        # bind the exception instance; ValueError (the class) has no .message
        print(error)
class KafkaSender():
    def __init__(self):
        self.client = KafkaClient(hosts)
        # self.producer = SimpleProducer(self.client, batch_send=batch_send,
        #                                batch_send_every_n=batch_send_every_n)
        self.producer = KafkaProducer(bootstrap_servers=hosts)
        self.client.ensure_topic_exists(topic)

    def send_messages(self, msg):
        self.producer.send(topic, msg)
class KafkaSender():
    def __init__(self):
        self.client = KafkaClient(hosts)
        self.producer = KafkaProducer(bootstrap_servers=hosts)
        self.client.ensure_topic_exists(topic)

    def send_messages(self, msg):
        self.producer.send(topic, msg)
        self.producer.flush()
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    bytes_topic = None
    zk = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10))
            self.topic = topic
            self.bytes_topic = topic.encode('utf-8')

        if self.create_client:
            self.client = KafkaClient('%s:%d' % (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request([
                OffsetRequest(kafka_bytestring(topic), partition, -1, 1)
            ])
        except:
            # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))
        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
def test_ensure_topic_exists(self, decode_metadata_response, conn):
    conn.recv.return_value = 'response'  # anything but None

    brokers = [
        BrokerMetadata(0, 'broker_1', 4567),
        BrokerMetadata(1, 'broker_2', 5678)
    ]

    topics = [
        TopicMetadata(b'topic_still_creating', NO_LEADER, []),
        TopicMetadata(b'topic_doesnt_exist', UNKNOWN_TOPIC_OR_PARTITION, []),
        TopicMetadata(b'topic_noleaders', NO_ERROR, [
            PartitionMetadata(b'topic_noleaders', 0, -1, [], [], NO_LEADER),
            PartitionMetadata(b'topic_noleaders', 1, -1, [], [], NO_LEADER),
        ]),
    ]
    decode_metadata_response.return_value = MetadataResponse(brokers, topics)

    client = KafkaClient(hosts=['broker_1:4567'])

    with self.assertRaises(UnknownTopicOrPartitionError):
        client.ensure_topic_exists('topic_doesnt_exist', timeout=1)

    with self.assertRaises(KafkaTimeoutError):
        client.ensure_topic_exists('topic_still_creating', timeout=1)

    # This should not raise
    client.ensure_topic_exists('topic_noleaders', timeout=1)
    client.ensure_topic_exists(b'topic_noleaders', timeout=1)
class KafkaHandler(object):
    topic = 'test'

    def __init__(self):
        self.client = KafkaClient('kafka:9092')
        self.producer = SimpleProducer(self.client)

    def create_topic(self):
        self.client.ensure_topic_exists(self.topic)
        return self

    def load_messages(self):
        with open('/data.json', 'r') as handle:
            self.producer.send_messages(self.topic, *handle.read().splitlines())
class Worker(object):
    def __init__(self, topic, hosts=None, log_level=logging.WARNING):
        hosts = hosts or "localhost:9092"
        self.group = "kafque"
        self.topic = "{}_{}".format(self.group, topic)
        self.client = KafkaClient(hosts)
        self.client.ensure_topic_exists(str(self.topic))
        self.consumer = SimpleConsumer(
            self.client, str(self.group), str(self.topic), auto_commit=False)
        self.consumer.provide_partition_info()
        self.consumer.fetch_last_known_offsets()
        self.logger = setup_logger(__name__, level=log_level)

        self.failed_queue = None
        if self.topic != "{}_failed".format(self.group):
            self.failed_queue = FailedQueue(hosts=hosts, log_level=logging.ERROR)

    def handle_signals(self):
        def warm_shutdown(signum, frame):
            # TODO: if worker is busy, defer cleanup to cold_shutdown
            self.logger.debug("Got signal {}.".format(signum))
            self.logger.warning("Warm shut down.")
            raise SystemExit()

        signal.signal(signal.SIGINT, warm_shutdown)
        signal.signal(signal.SIGTERM, warm_shutdown)

    def run(self):
        self.logger.info("kafque worker started.")
        self.handle_signals()

        for partition, message in self.consumer:
            self.logger.debug("Offset {}".format(message.offset))
            job = json.loads(message.message.value)
            callback = callback_from_string(job.pop("callback"))
            try:
                result = callback(*job["args"], **job["kwargs"])
                self.logger.info(result)
                self.consumer.commit()
            except Exception as exc:
                self.logger.error(exc, exc_info=True)
                # TODO: set job as failed
                if self.failed_queue:
                    self.failed_queue.enqueue(
                        callback, args=job["args"], kwargs=job["kwargs"])
                self.consumer.commit()
class KafkaBase(Base):
    """ A block defining common Kafka functionality.

    Properties:
        host (str): location of the database
        port (int): open port served by database
        topic (str): topic name
    """
    host = StringProperty(title='Host', default='[[KAFKA_HOST]]')
    port = IntProperty(title='Port', default=9092)
    topic = StringProperty(title='Topic', default="", allow_none=False)

    def __init__(self):
        super().__init__()
        self._kafka = None
        self._encoded_topic = None

    def configure(self, context):
        super().configure(context)

        if not len(self.topic()):
            raise ValueError("Topic cannot be empty")

        self._connect()

    def stop(self):
        self._disconnect()
        super().stop()

    def _connect(self):
        self._kafka = KafkaClient("{0}:{1}".format(self.host(), self.port()))
        self._encoded_topic = self.topic()

        # ensuring topic is valid
        try:
            self._kafka.ensure_topic_exists(self._encoded_topic)
        except Exception:
            self.logger.exception("Topic: {0} does not exist"
                                  .format(self.topic()))
            raise

    def _disconnect(self):
        if self._kafka:
            self._kafka.close()
            self._kafka = None

    @property
    def connected(self):
        return self._kafka
def kafka_sender():
    while True:
        try:
            log.info('connecting to kafka server at %s' % kafka_server)
            cl = KafkaClient(kafka_server)
            pr = SimpleProducer(cl)
            cn = KafkaConsumer('planedata', bootstrap_servers=[kafka_server],
                               group_id='planedata')
            cl.ensure_topic_exists('planedata')
            while True:
                msg = yield
                log.debug('committed 1 msg (%db) to kafka' % len(msg))
                pr.send_messages('planedata', msg)
        except:
            log.exception('failed to send kafka message - will retry in %d seconds'
                          % kafka_connect_retry_wait)
            time.sleep(kafka_connect_retry_wait)
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:],
                               random_string(10).decode('utf-8'))
            self.topic = topic.encode('utf-8')

        if self.create_client:
            self.client = KafkaClient('%s:%d' % (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        offsets, = self.client.send_offset_request(
            [OffsetRequest(topic, partition, -1, 1)])
        return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))
        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
class KafkaTransport(BaseTransport):
    def __init__(self, beaver_config, logger=None):
        super(KafkaTransport, self).__init__(beaver_config, logger=logger)

        self._kafka_config = {}
        config_to_store = [
            'client_id', 'hosts', 'async', 'topic', 'key',
            'ack_timeout', 'codec', 'batch_n', 'batch_t', 'round_robin'
        ]

        for key in config_to_store:
            self._kafka_config[key] = beaver_config.get('kafka_' + key)

        try:
            self._client = KafkaClient(self._kafka_config['hosts'],
                                       self._kafka_config['client_id'])
            self._client.ensure_topic_exists(self._kafka_config['topic'])

            self._key = self._kafka_config['key']
            if self._key is None:
                self._prod = SimpleProducer(
                    self._client,
                    async=self._kafka_config['async'],
                    req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
                    ack_timeout=self._kafka_config['ack_timeout'],
                    codec=self._kafka_config['codec'],
                    batch_send=True,
                    batch_send_every_n=self._kafka_config['batch_n'],
                    batch_send_every_t=self._kafka_config['batch_t'])
            else:
                partitioner = None
                if self._kafka_config['round_robin']:
                    partitioner = RoundRobinPartitioner
                self._prod = KeyedProducer(
                    self._client,
                    async=self._kafka_config['async'],
                    partitioner=partitioner,
                    req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
                    ack_timeout=self._kafka_config['ack_timeout'],
                    codec=self._kafka_config['codec'],
                    batch_send=True,
                    batch_send_every_n=self._kafka_config['batch_n'],
                    batch_send_every_t=self._kafka_config['batch_t'])

            self._is_valid = True
        except Exception, e:
            raise TransportException(e.message)
def topic_security(ip):
    """Ensures our topic exists

    If we're the first one online it won't exist, this will not be needed
    once we configure topics in the kafka configuration

    This will open a connection, create the topic, then close the connection

    **Issues**:
        - The Port is hardcoded

    :param ip: The IP of our Kafka Box
    :type ip: str
    """
    kafka = KafkaClient("%s:9092" % (ip))
    kafka.ensure_topic_exists(TOPIC)
    kafka.close()
def setUp(self):
    logging.basicConfig()
    handler = logging.StreamHandler(stdout)
    logger = logging.getLogger("kafka")
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)

    self.logger = logging.getLogger("tester")
    self.logger.debug("setup started")
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    client.ensure_topic_exists("frontier-todo")
    client.ensure_topic_exists("frontier-done")
    client.ensure_topic_exists("frontier-score")
    client.close()

    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('SPIDER_LOG_PARTITIONS', 1)
    settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.messagebus = KafkaMessageBus(settings)
    spiderlog = self.messagebus.spider_log()

    # sw
    self.sw_sl_c = KafkaConsumerPolling(
        spiderlog.consumer(partition_id=0, type=b'sw'))
    scoring_log = self.messagebus.scoring_log()
    self.sw_us_p = scoring_log.producer()

    # db
    self.db_sl_c = KafkaConsumerPolling(
        spiderlog.consumer(partition_id=None, type=b'db'))
    self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())
    spider_feed = self.messagebus.spider_feed()
    self.db_sf_p = spider_feed.producer()

    # spider
    self.sp_sl_p = spiderlog.producer()
    self.sp_sf_c = KafkaConsumerPolling(
        spider_feed.consumer(partition_id=0))
    self.logger.debug("init is done")
def test_kafka_message_bus_integration():
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    client.ensure_topic_exists("frontier-todo")
    client.ensure_topic_exists("frontier-done")
    client.ensure_topic_exists("frontier-score")

    logging.basicConfig(level=logging.INFO)
    # kafkabus = logging.getLogger("kafkabus")
    # kafkabus.addHandler(logging.StreamHandler())
    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('FRONTIER_GROUP', 'frontier2')
    settings.set('SCORING_TOPIC', "frontier-score")
    tester = MessageBusTester(KafkaMessageBus, settings)
    tester.spider_log_activity(64)
    assert tester.sw_activity() == 64
    assert tester.db_activity(128) == (64, 32)
    assert tester.spider_feed_activity() == 128
class Queue(object):
    def __init__(self, topic, hosts=None, log_level=logging.WARNING):
        hosts = hosts or "localhost:9092"
        self.topic = "{}_{}".format("kafque", topic)
        self.client = KafkaClient(hosts)
        self.client.ensure_topic_exists(str(self.topic))
        self.producer = SimpleProducer(
            self.client, req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT)
        self.logger = setup_logger(__name__, level=log_level)

    def enqueue(self, callback, args=None, kwargs=None):
        _callback = "{}.{}".format(callback.__module__, callback.__name__)
        job = json.dumps({
            "callback": _callback,
            "args": args or (),
            "kwargs": kwargs or {},
        })
        return self.producer.send_messages(str(self.topic), job)
class KafkaHandler(logging.Handler):
    def __init__(self, settings):
        self.settings = settings
        self.client = KafkaClient(settings.get("KAFKA_HOSTS"))
        self.producer = SimpleProducer(self.client)
        self.producer.send_messages = failedpayloads_wrapper(
            settings.get("KAFKA_RETRY_TIME", 5))(self.producer.send_messages)
        super(KafkaHandler, self).__init__()

    def emit(self, record):
        self.client.ensure_topic_exists(self.settings.get("TOPIC"))
        buf = self.formatter.format(record)
        if hasattr(buf, "encode"):
            buf = buf.encode(sys.getdefaultencoding())
        self.producer.send_messages(self.settings.get("TOPIC"), buf)

    def close(self):
        self.acquire()
        super(KafkaHandler, self).close()
        self.client.close()
        self.release()
def test_ensure_topic_exists(self, protocol, conn):
    conn.recv.return_value = "response"  # anything but None

    brokers = [BrokerMetadata(0, "broker_1", 4567),
               BrokerMetadata(1, "broker_2", 5678)]

    topics = [
        TopicMetadata("topic_still_creating", NO_LEADER, []),
        TopicMetadata("topic_doesnt_exist", UNKNOWN_TOPIC_OR_PARTITION, []),
        TopicMetadata("topic_noleaders", NO_ERROR, [
            PartitionMetadata("topic_noleaders", 0, -1, [], [], NO_LEADER),
            PartitionMetadata("topic_noleaders", 1, -1, [], [], NO_LEADER),
        ]),
    ]
    protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics)

    client = KafkaClient(hosts=["broker_1:4567"])

    with self.assertRaises(UnknownTopicOrPartitionError):
        client.ensure_topic_exists("topic_doesnt_exist", timeout=1)

    with self.assertRaises(KafkaTimeoutError):
        client.ensure_topic_exists("topic_still_creating", timeout=1)

    # This should not raise
    client.ensure_topic_exists("topic_noleaders", timeout=1)
def main():
    # check for --version or -V
    if args.version:
        print("Ask [email protected]")

    if args.run:
        topic = args.run.split('/')[0]
        msg = bytes('RUN ' + str(args.run.split('/')[1]), 'utf8')
        kafka = KafkaClient(':'.join([ipAddress, str(portKafka)]))
        producer = SimpleProducer(kafka)
        kafka.ensure_topic_exists(topic)
        try:
            print_response(producer.send_messages(topic, msg))
        except LeaderNotAvailableError:
            time.sleep(1)
            print_response(producer.send_messages(topic, msg))
        kafka.close()
def main():
    # To send messages synchronously
    kafka = KafkaClient('localhost:9092')
    producer = KeyedProducer(kafka)

    # Insure that topic exists
    kafka.ensure_topic_exists('test')

    while True:
        input_str = raw_input("Press enter to send another message, otherwise press 'q' to quit: ")

        if input_str and input_str in "qQ":
            sys.exit(0)

        if not input_str:
            print "No input was provided"
        else:
            producer.send_messages(
                'test',                                                     # topic
                'topic-key',                                                # key
                "(time: {}, message: {})".format(get_time(), input_str),   # message
            )
class KafkaHelper:
    """
    Utility class to interact with Kafka Brokers
    Internally uses kafka-python library
    """

    def __init__(self):
        # TODO: Make kafka broker list configurable
        try:
            self.kafka = KafkaClient(kw_settings.KW_KAFKA_BROKER_LIST)
        except:
            print 'Error - connecting to Kafka broker : ' + kw_settings.KW_KAFKA_BROKER_LIST
            self.kafka = None
        self.retry_count = 5
        self.retry_interval_in_ms = 5000

    def close(self):
        if self.kafka:
            self.kafka.close()

    def _ensure_kafka_topic_exists(self, topic):
        result = False
        for i in range(self.retry_count):
            try:
                self.kafka.ensure_topic_exists(topic)
                result = True
                break
            except:
                print 'Warning - Unable to create kafka topic : ' + topic
                print traceback.print_exc()
                time.sleep(self.retry_interval_in_ms / 1000)
        return result

    def upload_file_to_kafka(self, topic, file_path, **kwargs):
        """
        Utility function to upload contents of file to a given kafka topic

        :param topic: Kafka topic to which the file will be uploaded
        :param file_path: Absolute path of the file to be uploaded
        :param kwargs: append - If True, then file content will be uploaded to existing topic.
                       If topic is not present then new one will be created.
                       If false, and topic is not present then new topic is created.
                       If topic is already present then error is returned.
                       Default, async=False
        :return: True if content was uploaded else false
        """
        append = kwargs.get('append', False)
        result = False
        producer = None
        try:
            if not append:
                # Check if topic is already present
                if self.kafka.has_metadata_for_topic(topic):
                    print 'Error - Kafka topic : ' + topic + ' already present and append is : ' + str(append)
                    return False

            # In case of append is True and topic already present/not present
            # and append is False and topic already not present
            if self._ensure_kafka_topic_exists(topic):
                producer = SimpleProducer(self.kafka, batch_send=True, batch_send_every_n=20)
                with open(file_path, 'rU') as fh:
                    for line in fh:
                        producer.send_messages(topic, line.strip())
                result = True
        except:
            print 'Error - uploading file : ' + file_path + ' to topic : ' + topic
        finally:
            if producer:
                producer.stop()
        return result
def createTopic(self, brokers, topicName):
    client = KafkaClient(brokers)
    client.ensure_topic_exists(topic=topicName)
class TestRedisMonitor(TestCase): maxDiff = None queue_key = "link:istresearch.com:queue" def setUp(self): self.redis_monitor = RedisMonitor("localsettings.py") self.redis_monitor.settings = self.redis_monitor.wrapper.load("localsettings.py") self.redis_monitor.logger = MagicMock() self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test" self.redis_monitor.settings['STATS_TOTAL'] = False self.redis_monitor.settings['STATS_PLUGINS'] = False self.redis_monitor.settings['PLUGINS'] = { 'plugins.info_monitor.InfoMonitor': None, 'plugins.stop_monitor.StopMonitor': None, 'plugins.expire_monitor.ExpireMonitor': None, 'tests.tests_online.CustomMonitor': 100, } self.redis_monitor.redis_conn = redis.Redis( host=self.redis_monitor.settings['REDIS_HOST'], port=self.redis_monitor.settings['REDIS_PORT']) self.redis_monitor._load_plugins() self.redis_monitor.stats_dict = {} self.kafka_conn = KafkaClient(self.redis_monitor.settings[ 'KAFKA_HOSTS']) self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose") self.consumer = SimpleConsumer( self.kafka_conn, "demo-id", "demo_test.outbound_firehose" ) def test_process_item(self): # we only want to go to the end now, not after this test is ran self.consumer.seek(0, 2) # set the info flag key = "info-test:blah" value = "ABC123" self.redis_monitor.redis_conn.set(key, value) # process the request plugin = self.redis_monitor.plugins_dict.items()[0][1] self.redis_monitor._process_plugin(plugin) # ensure the key is gone self.assertEquals(self.redis_monitor.redis_conn.get(key), None) def test_sent_to_kafka(self): success = { u'info-test': "ABC123", u"appid": u"someapp" } # ensure it was sent out to kafka message_count = 0 for message in self.consumer.get_messages(): if message is None: break else: the_dict = json.loads(message.message.value) self.assertEquals(success, the_dict) message_count += 1 self.assertEquals(message_count, 1)
class Aria2Dispatcher: def __init__(self, host, topic, consumer_id, settings): self.host = host self.topic = topic self.consumer_id = consumer_id or "Aria2Dispatcher" self.settings = importlib.import_module(settings[:-3]) self.kafka_client = KafkaClient(self.settings.KAFKA_HOSTS) self.producer = SimpleProducer(self.kafka_client) self.topic_prefix = self.settings.KAFKA_TOPIC_PREFIX self.topic_list = [] self.aria2_clients = [] for x in self.settings.ARIA2_ADDRESSES: rpc_uri = "ws://%s/jsonrpc" % x try: aria2_connection = create_connection(rpc_uri) self.aria2_clients.append({ 'rpc_uri': rpc_uri, 'ws': aria2_connection }) except: logger.error('create aria2_connection error!') raise def _process_item(self, item, aria2_client_index): prefix = self.topic_prefix crawled_firehose_images_topic = "{prefix}.crawled_firehose_images".format( prefix=prefix) if 'updates' in item['meta']['collection_name']: message = json.dumps(item) print("in..... if 'updates' in item['meta']['collection_name']:") print('collection_name::', item['meta']['collection_name']) else: self._process_item_images(item, aria2_client_index) try: if 'images' in item and len(item['images']) > 0: message = json.dumps(item) else: message = 'no images.' except: message = 'json failed to parse' logger.error(message) self._check_topic(crawled_firehose_images_topic) self.producer.send_messages(crawled_firehose_images_topic, message) logger.info("send message to kafka topic:: %s " % crawled_firehose_images_topic) logger.info("message= %s" % message) def _process_item_images(self, item, aria2_client_index): image_urls = item["image_urls"] if len(image_urls) > 0: req_methods = [] images = [] for url in image_urls: filename, file_ext = splitext(basename(urlparse(url).path)) if len(file_ext) == 0: file_ext = ".jpg" out_file_name_base = sha1(url) out_file_name = "%s%s" % (out_file_name_base, file_ext) dir_name = '%s/%s/%s/%s/%s' % ( self.settings.IMAGES_STORE, item['meta']['spiderid'], out_file_name_base[:3], out_file_name_base[3:6], out_file_name_base[6:]) options = dict(dir=dir_name, out=out_file_name) if not exists(dir_name + '/' + out_file_name): req_methods.append({ "methodName": "aria2.addUri", "params": [[url], options] }) images.append({ 'url': url, 'path': "%s/%s" % (dir_name, out_file_name), 'aria2': { 'rpc_uri': self.aria2_clients[aria2_client_index]['rpc_uri'] } }) req = { "jsonrpc": 2, "id": str(uuid.uuid1()), "method": "system.multicall", "params": [req_methods] } jsonreq = json.dumps(req) try: self.aria2_clients[aria2_client_index]['ws'].send(jsonreq) resp = self.aria2_clients[aria2_client_index]['ws'].recv() ws_resp = json.loads(resp) print('resp:', resp) logger.info('resp:: %s ' % resp) for image, gid in zip(images, map(lambda x: x[0], ws_resp['result'])): image['aria2']['gid'] = gid except Exception as err: print('error::', err) logger.error(err) item['images'] = images def _check_topic(self, topic_name): if topic_name not in self.topic_list: self.kafka_client.ensure_topic_exists(topic_name) self.topic_list.append(topic_name) def dispatch(self): consumer = SimpleConsumer( self.kafka_client, self.consumer_id, self.topic, buffer_size=1024 * 100, # 100kb fetch_size_bytes=1024 * 100, # 100kb max_buffer_size=None # eliminate big message errors ) consumer.seek(0, 1) i = 0 while True: try: message = consumer.get_message() if message is None: print datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S"), ' message is None:' logger.info('message is None.') time.sleep(1) continue val = message.message.value try: item = json.loads(val) i += 
1 self._process_item(item, i % len(self.aria2_clients)) except: print("error heppened in loads val to process : %s" % val) logger.error("error heppened in loads val to process: %s" % val) continue except: traceback.print_exc() break self.kafka_client.close() return 0
from kafka import KafkaClient
from ksql import KSQLAPI

kafka_client = KafkaClient(hosts=['localhost:9092'])
kafka_client.ensure_topic_exists('gas_prices')
kafka_client.ensure_topic_exists('locations')

client = KSQLAPI('http://localhost:8088')
client.ksql("SET 'auto.offset.reset' = 'earliest';")

# Drop existing streams
client.ksql('DROP STREAM alerts;')
client.ksql('DROP STREAM locations;')
client.ksql('DROP STREAM gas_prices;')

# Creates gas_prices as a stream
client.ksql('''
CREATE STREAM gas_prices \
    (stationid VARCHAR, lat DOUBLE, long DOUBLE, price DOUBLE, recordtime BIGINT, joinner INT) \
    WITH (KAFKA_TOPIC='gas_prices', VALUE_FORMAT='JSON');
''')

# Creates the location stream
client.ksql('''
CREATE STREAM locations \
    (userid VARCHAR, lat DOUBLE, long DOUBLE, recordtime BIGINT, joinner INT) \
    WITH (KAFKA_TOPIC='locations', VALUE_FORMAT='JSON');
''')

# Creates the alert stream using the gas_prices stream
client.sql('''
class Traptor(object): def __init__(self, redis_conn, pubsub_conn, traptor_type, apikeys, traptor_id=0, kafka_hosts='localhost:9092', kafka_topic='traptor', kafka_enabled=True, log_level='INFO', test=False, traptor_notify_channel='traptor-notify' ): """ Traptor base class. :param dict apikeys: dictionary of API keys for traptor instnace. See settings.py for details. :param str traptor_type: follow, track, or geo. :param int traptor_id: numerical ID of traptor instance. :param str kafka_hosts: kafka hosts to connect to. :param str kafka_topic: name of the kafka topic to write to. :param str redis_conn: redis connection to use. :param bool kafka_enabled: write to kafka or just log to something else. :param str log_level: log level of the traptor logger instance. :param bool test: True for traptor test instance. :param str traptor_notify_channel: name of the Traptor PubSub channel to subscribe to :param str pubsub_conn: redis pubsub connection to use """ self.apikeys = apikeys self.traptor_type = traptor_type self.traptor_id = traptor_id self.kafka_hosts = kafka_hosts self.kafka_topic = kafka_topic self.redis_conn = redis_conn self.kafka_enabled = kafka_enabled self.log_level = log_level self.test = test self.traptor_notify_channel = traptor_notify_channel self.pubsub_conn = pubsub_conn def __repr__(self): return 'Traptor({}, {}, {}, {}, {}, {}, {}, {}, {}, {} ,{})'.format( self.apikeys, self.traptor_type, self.traptor_id, self.kafka_hosts, self.kafka_topic, self.redis_conn, self.kafka_enabled, self.log_level, self.test, self.traptor_notify_channel, self.pubsub_conn ) def _setup_birdy(self): """ Set up a birdy twitter stream. If there is a TwitterApiError it will exit with status code 3. This was done to prevent services like supervisor from automatically restart the process causing the twitter API to get locked out. Creates ``self.birdy_conn``. """ # Set up a birdy twitter streaming client self.logger.info('Setting up birdy connection...') self.birdy_conn = MyBirdyClient( self.apikeys['CONSUMER_KEY'], self.apikeys['CONSUMER_SECRET'], self.apikeys['ACCESS_TOKEN'], self.apikeys['ACCESS_TOKEN_SECRET'] ) def _setup_kafka(self): """ Set up a Kafka connection. Creates ``self.kafka_conn`` if it can reach the kafka brokers. """ if self.kafka_enabled: self.logger.info('Setting up kafka connection...') self.kafka_conn = KafkaClient(hosts=self.kafka_hosts) else: self.logger.info('Skipping kafka connection setup') self.kafka_conn = None def _setup(self): """ Load everything up. Note that any arg here will override both default and custom settings. """ # Set up logging self.logger = LogFactory.get_instance(name='traptor', level=self.log_level) # Set the restart_flag to False self.restart_flag = False # Set up required connections self._setup_kafka() self._setup_birdy() def _create_kafka_producer(self, kafka_topic): """ Create a kafka producer. If it cannot find one it will exit with error code 3. Creates ``self.kafka_producer``. """ if self.kafka_conn: try: self.logger.debug('Creating kafka producer for "{}"...'.format(self.kafka_topic)) self.kafka_producer = SimpleProducer(self.kafka_conn) except KafkaUnavailableError as e: self.logger.critical(e) sys.exit(3) try: self.logger.debug('Ensuring the "{}" kafka topic exists'.format(self.kafka_topic)) self.kafka_conn.ensure_topic_exists(self.kafka_topic) except: raise else: self.kafka_producer = None def _create_birdy_stream(self): """ Create a birdy twitter stream. If there is a TwitterApiError it will exit with status code 3. 
This was done to prevent services like supervisor from automatically restart the process causing the twitter API to get locked out. Creates ``self.birdy_stream``. """ if self.traptor_type == 'follow': # Try to set up a twitter stream using twitter id list try: self.logger.info('Creating birdy "follow" stream') self.birdy_stream = self.birdy_conn.stream.statuses.filter.post(follow=self.twitter_rules) except TwitterApiError as e: self.logger.critical(e) sys.exit(3) elif self.traptor_type == 'track': # Try to set up a twitter stream using twitter term list try: self.logger.info('Creating birdy "track" stream') self.birdy_stream = self.birdy_conn.stream.statuses.filter.post(track=self.twitter_rules) except TwitterApiError as e: self.logger.critical(e) sys.exit(3) elif self.traptor_type == 'locations': # Try to set up a twitter stream using twitter term list try: self.logger.info('Creating birdy "locations" stream') self.birdy_stream = self.birdy_conn.stream.statuses.filter.post(locations=self.twitter_rules) except TwitterApiError as e: self.logger.critical(e) sys.exit(3) else: self.logger.critical('That traptor type has not been implemented') sys.exit(3) def _make_twitter_rules(self, rules): """ Convert the rules from redis into a format compatible with the Twitter API. :param list rules: The rules are expected to be a list of dictionaries that comes from redis. :returns: A ``str`` of twitter rules that can be loaded into the a birdy twitter stream. """ rules_str = ','.join([rule['value'] for rule in rules]) self.logger.debug('Twitter rules string: {}'.format(rules_str.encode('utf-8'))) return rules_str def _add_rule_tag_and_value_to_tweet(self, tweet_dict, search_str, rule_tag, rule_value): for k, v in FlatDict(tweet_dict).iteritems(): if isinstance(v, unicode) and search_str.lower() in v.lower(): tweet_dict['traptor']['rule_tag'] = rule_tag tweet_dict['traptor']['rule_value'] = rule_value return tweet_dict def _find_rule_matches(self, tweet_dict): """ Find which rule the tweet matched. This code only expects there to be one match. If there is more than one, it will use the last one it finds since the first match will be overwritten. :param dict tweet_dict: The dictionary twitter object. :returns: a ``dict`` with the augmented data fields. """ new_dict = self._create_traptor_obj(tweet_dict) self.logger.debug('Finding tweet rule matches') for rule in self.redis_rules: search_str = rule['value'] # self.logger.debug("Search string used for the rule match: {}".format(search_str.encode('utf-8'))) if re.search(',', search_str): for s in search_str.split(','): new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, s, rule['tag'], rule['value']) else: search_str = rule['value'].split()[0] for i in new_dict.keys(): new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, search_str, rule['tag'], rule['value']) # self.logger.debug('Rule matched - tag:{}, value:{}'.format(rule['tag'], # rule['value'].encode('utf-8'))) if 'rule_tag' not in new_dict['traptor']: self.logger.warning('Could not find rule_tag: {}, rule_value: {}, in tweet {}'.format( rule['tag'], rule['value'], new_dict.get('id_str'))) new_dict['traptor']['rule_tag'] = 'Not found' new_dict['traptor']['rule_value'] = 'Not found' return new_dict def _get_redis_rules(self): """ Yields a traptor rule from redis. 
This function expects that the redis keys are set up like follows: traptor-<traptor_type>:<traptor_id>:<rule_id> For example, traptor-follow:0:34 traptor-track:0:5 traptor-locations:0:2 For 'follow' twitter streaming, each traptor may only follow 5000 twitter ids, as per the Twitter API. For 'track' twitter stream, each traptor may only track 400 keywords, as per the Twitter API. For 'locations' twitter stream, each traptor may only track 25 bounding boxes, as per the Twitter API. :returns: Yields a traptor rule from redis. """ # Set up API limitation checks if self.traptor_type == 'follow': rule_max = 5000 elif self.traptor_type == 'track': rule_max = 400 elif self.traptor_type == 'locations': rule_max = 25 else: self.logger.error('traptor_type of {0} is not supported'.format( self.traptor_type)) raise(NotImplementedError) # for rule in xrange(rule_max): redis_key = 'traptor-{0}:{1}'.format(self.traptor_type, self.traptor_id) match = ':'.join([redis_key, '*']) try: for idx, hashname in enumerate(self.redis_conn.scan_iter(match=match)): if idx < rule_max: redis_rule = self.redis_conn.hgetall(hashname) yield redis_rule self.logger.debug('Index: {0}, Redis_rule: {1}'.format( idx, redis_rule)) except ConnectionError as e: self.logger.critical(e) sys.exit(3) # Special error code to track known failures @staticmethod def _tweet_time_to_iso(tweet_time): """ Convert tweet time into ISO time format. :returns: A ``str`` of the ISO formated time. """ return parser.parse(tweet_time).isoformat() def _create_traptor_obj(self, tweet_dict): if 'traptor' not in tweet_dict: tweet_dict['traptor'] = {} return tweet_dict def _fix_tweet_object(self, tweet_dict): """ Do any pre-processing to raw tweet data. :param dict tweet_dict: A tweet dictionary object. :returns: A ``dict`` with a new 'created_at_iso field. """ new_dict = self._create_traptor_obj(tweet_dict) if new_dict.get('created_at'): new_dict['traptor']['created_at_iso'] = self._tweet_time_to_iso( new_dict['created_at']) # self.logger.debug('Fixed tweet object: \n {}'.format( # json.dumps(new_dict, indent=2))) return new_dict def _check_redis_pubsub_for_restart(self): """ Subscribe to Redis PubSub and restart if necessary. Check the Redis PubSub channel and restart Traptor if a message for this Traptor is found. """ self.logger.info("Subscribing to the Traptor notification PubSub.") self.logger.debug("restart_flag = {}".format(self.restart_flag)) p = self.pubsub_conn.pubsub() p.subscribe(self.traptor_notify_channel) while self.restart_flag is not True: m = p.get_message() if m is not None: data = str(m['data']) t = data.split(':') self.logger.debug("PubSub Message: {}".format(t)) if t[0] == self.traptor_type and t[1] == str(self.traptor_id): # Log the action and restart self.restart_flag = True self.logger.debug("Redis PubSub message found. \ Setting restart flag to True.") def _main_loop(self): """ Main loop for iterating through the twitter data. This method iterates through the birdy stream, does any pre-processing, and adds enrichments to the data. If kafka is enabled it will write to the kafka topic defined when instantiating the Traptor class. 
""" # Iterate through the twitter results for item in self.birdy_stream._stream_iter(): if item: try: _data = json.loads(item) except: pass else: # self.logger.debug('Raw Tweet Data: \n {0}'.format( # json.dumps(_data, indent=2))) # Do tweet data pre-processing data = self._fix_tweet_object(_data) # Do any data enrichment on the base tweet data enriched_data = self._find_rule_matches(data) # Stdout data output for Traptor. print json.dumps(enriched_data, indent=2) if self.kafka_enabled: self.kafka_producer.send_messages(self.kafka_topic, json.dumps(enriched_data)) if self.restart_flag: self.logger.info("Reset flag is true; restarting myself.") break def run(self): """ Run method for running a traptor instance. It sets up the logging, connections, grabs the rules from redis, and starts writing data to kafka if enabled. """ # Setup connections and logging self._setup() ps_check = threading.Thread(group=None, target=self._check_redis_pubsub_for_restart ) ps_check.setDaemon(True) ps_check.start() while True: # Grab a list of {tag:, value:} rules self.redis_rules = [rule for rule in self._get_redis_rules()] self.logger.debug("Redis rules: {}".format(self.redis_rules)) # Concatenate all of the rule['value'] fields self.twitter_rules = self._make_twitter_rules(self.redis_rules) self.logger.debug("Twitter rules: {}".format(self.twitter_rules.encode('utf-8'))) if self.kafka_enabled: self._create_kafka_producer(self.kafka_topic) if not self.test: self._create_birdy_stream() self.restart_flag = False # Start collecting data self._main_loop()
def kafka_writer( path, producer='simple', topic='eventlogging_%(schema)s', key='%(schema)s_%(revision)s', blacklist=None, raw=False, **kafka_producer_args ): """ Write events to Kafka. Kafka URIs look like: kafka:///b1:9092,b2:9092?topic=eventlogging_%s(schema)&async=True&... This producer uses either SimpleProducer or KeyedProducer from kafka-python. You may pass any configs that base Producer takes as keyword arguments via URI query params. NOTE: If you do not explicitly set it, async will default to True. path - URI path should be comma separated Kafka Brokers. e.g. kafka01:9092,kafka02:9092,kafka03:9092 producer - Either 'keyed' or 'simple'. Default: 'simple'. topic - Python format string topic name. If the incoming event is a dict (not a raw string) topic will be interpolated against event. I.e. topic % event. Default: eventlogging_%(schema)s key - Python format string key of the event message in Kafka. If the incoming event is a dict (not a raw string) key will be interpolated against event. I.e. key % event. Default: %(schema)s_%(revision)s. This is ignored if you are using the simple producer. blacklist - Pattern string matching a list of schemas that should not be written. This is useful to keep high volume schemas from being written to an output stream. This will be ignored if the incoming events are raw. raw - Should the events be written as raw (encoded) or not? """ # Brokers should be in the uri path brokers = path.strip('/') # remove non Kafka Producer args from kafka_consumer_args kafka_producer_args = { k: v for k, v in items(kafka_producer_args) if k in inspect.getargspec(Producer.__init__).args } # Use async producer by default if 'async' not in kafka_producer_args: kafka_producer_args['async'] = True kafka = KafkaClient(brokers) if producer == 'keyed': ProducerClass = KeyedProducer else: ProducerClass = SimpleProducer kafka_producer = ProducerClass(kafka, **kafka_producer_args) # These will be used if incoming events are not interpolatable. default_topic = topic.encode('utf8') default_key = key.encode('utf8') kafka_topic_create_timeout_seconds = 0.1 if blacklist: blacklist_pattern = re.compile(blacklist) else: blacklist_pattern = None while 1: event = (yield) # If event is a dict (not Raw) then we can interpolate topic and key # as format strings. # E.g. message_topic = 'eventlogging_%(schema)s' % event. # WARNING! Be sure that your topic and key strings don't try # to interpolate out a field in event that doesn't exist! if isinstance(event, dict): if blacklist_pattern and blacklist_pattern.match(event['schema']): logging.debug( '%s is blacklisted, not writing event %s.' % (event['schema'], event['uuid']) ) continue message_topic = (topic % event).encode('utf8') if producer == 'keyed': message_key = (key % event).encode('utf8') else: message_topic = default_topic message_key = default_key try: # Make sure this topic exists before we attempt to produce to it. # This call will timeout in kafka_topic_create_timeout_seconds. # This should return faster than this if this kafka client has # already cached topic metadata for this topic. Otherwise # it will try to ask Kafka for it each time. Make sure # auto.create.topics.enabled is true for your Kafka cluster! 
kafka.ensure_topic_exists( message_topic, kafka_topic_create_timeout_seconds ) except KafkaTimeoutError: error_message = "Failed to ensure Kafka topic %s exists " \ "in %f seconds when producing event" % ( message_topic, kafka_topic_create_timeout_seconds ) if isinstance(event, dict): error_message += " of schema %s revision %d" % ( event['schema'], event['revision'] ) error_message += ". Skipping event. " \ "(This might be ok if this is a new topic.)" logging.warn(error_message) continue if raw: value = event.encode('utf-8') else: value = json.dumps(event, sort_keys=True) # send_messages() for the different producer types have different # signatures. Call it appropriately. if producer == 'keyed': kafka_producer.send_messages(message_topic, message_key, value) else: kafka_producer.send_messages(message_topic, value)
class TestLinkSpider(TestCase): example_feed = "\x80\x02}q\x00(X\x0f\x00\x00\x00allowed_domainsq\x01NX"\ "\x0b\x00\x00\x00allow_regexq\x02NX\a\x00\x00\x00crawlidq\x03X\x19"\ "\x00\x00\x0001234567890abcdefghijklmnq\x04X\x03\x00\x00\x00urlq\x05X"\ "\x13\x00\x00\x00www.istresearch.comq\x06X\a\x00\x00\x00expiresq\aK"\ "\x00X\b\x00\x00\x00priorityq\bK\x01X\n\x00\x00\x00deny_regexq\tNX\b"\ "\x00\x00\x00spideridq\nX\x0b\x00\x00\x00test-spiderq\x0bX\x05\x00"\ "\x00\x00attrsq\x0cNX\x05\x00\x00\x00appidq\rX\a\x00\x00\x00testappq"\ "\x0eX\x06\x00\x00\x00cookieq\x0fNX\t\x00\x00\x00useragentq\x10NX\x0f"\ "\x00\x00\x00deny_extensionsq\x11NX\b\x00\x00\x00maxdepthq\x12K\x00u." def setUp(self): self.settings = get_project_settings() self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test") # set up redis self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: self.redis_conn.info() except ConnectionError: print "Could not connect to Redis" # plugin is essential to functionality sys.exit(1) # clear out older test keys if any keys = self.redis_conn.keys("test-spider:*") for key in keys: self.redis_conn.delete(key) # set up kafka to consumer potential result self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose") self.consumer = SimpleConsumer( self.kafka_conn, "demo-id", "demo_test.crawled_firehose", buffer_size=1024*100, fetch_size_bytes=1024*100, max_buffer_size=None ) # move cursor to end of kafka topic self.consumer.seek(0, 2) def test_crawler_process(self): runner = CrawlerRunner(self.settings) d = runner.crawl(CustomSpider) d.addBoth(lambda _: reactor.stop()) # add crawl to redis key = "test-spider:istresearch.com:queue" self.redis_conn.zadd(key, self.example_feed, -99) # run the spider, give 20 seconds to see the url, crawl it, # and send to kafka. Then we kill the reactor def thread_func(): time.sleep(20) reactor.stop() thread = threading.Thread(target=thread_func) thread.start() reactor.run() # ensure it was sent out to kafka message_count = 0 for message in self.consumer.get_messages(): if message is None: break else: the_dict = json.loads(message.message.value) if the_dict is not None and the_dict['appid'] == 'testapp' \ and the_dict['crawlid'] == '01234567890abcdefghijklmn': message_count += 1 self.assertEquals(message_count, 1) def tearDown(self): keys = self.redis_conn.keys('stats:crawler:*:test-spider:*') keys = keys + self.redis_conn.keys('test-spider:*') for key in keys: self.redis_conn.delete(key)
class KafkaBaseMonitor(BaseMonitor):
    '''
    Base monitor for handling outbound Kafka results
    '''

    def setup(self, settings):
        '''
        Setup the handler

        @param settings: The loaded settings file
        '''
        if settings['KAFKA_REMOVE']:
            return

        @MethodTimer.timeout(settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                # set up kafka
                self.kafka_conn = KafkaClient(settings['KAFKA_HOSTS'])
                self.producer = SimpleProducer(self.kafka_conn)
                self.topic_prefix = settings['KAFKA_TOPIC_PREFIX']
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured while setting up kafka. "\
                    "Arguments:\n{1!r}".format(type(ex).__name__, ex.args)
                self.logger.error(message)
                return False
            return True

        ret_val = _hidden_setup()
        self.use_appid_topics = settings['KAFKA_APPID_TOPICS']

        if ret_val:
            self.logger.debug("Successfully connected to Kafka in {name}"
                              .format(name=self.__class__.__name__))
        else:
            self.logger.error("Failed to set up Kafka Connection in {name} "
                              "within timeout".format(name=self.__class__.__name__))
            # this is essential to running the redis monitor
            sys.exit(1)

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka
        @param master: the final dict to send
        @returns: True if successfully sent to kafka
        '''
        if settings['KAFKA_REMOVE']:
            return

        appid_topic = "{prefix}.outbound_{appid}".format(
            prefix=self.topic_prefix, appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
            prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # dont want logger in outbound kafka message
            dump = json.dumps(master)
            if self.use_appid_topics:
                self.kafka_conn.ensure_topic_exists(appid_topic)
                self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)
            return True
        except Exception as ex:
            message = "An exception '{0}' occured while sending a message " \
                "to kafka. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            self.logger.error(message)
            return False
from kafka import KafkaProducer, KafkaClient, KafkaConsumer
from tornado import websocket, httpserver, web, ioloop
import socket

producer = KafkaProducer(bootstrap_servers=['0.0.0.0:9092'])
consumer = KafkaConsumer('Game', bootstrap_servers=['0.0.0.0:9092'])
client = KafkaClient('0.0.0.0:9092')
client.ensure_topic_exists('Game')


def message_generator():
    for message in consumer:
        yield message.value

gens = message_generator()


class InputSocket(websocket.WebSocketHandler):
    def open(self):
        print("WebSocket opened")

    def on_message(self, message):
        print(message)
        producer.send("Game", message.encode("utf-8"))
        self.write_message(u"Jake magnanimously acknowledges your request")

    def check_origin(self, origin):
        return True

    def on_close(self):
from kafka import KeyedProducer, KafkaClient
import json
import time

file_path = '../sample_data/starbucks/starbucks.geojson'

features = ""
with open(file_path) as data_file:
    features = json.load(data_file).get('features')

# To send messages asynchronously
client = KafkaClient('kafka.dev:9092')
producer = KeyedProducer(client, async=True)
topic = 'starbucks'
client.ensure_topic_exists(topic)

# run all?
feature_count = len(features)
# or run some?
# feature_count = 10

interval = 1  # in seconds

index = 5
while index < feature_count:
    time.sleep(interval)
    photos = []
    photos_url = []
    if features[index].get('properties').get('photos'):
        for photo in features[index].get('properties').get('photos').split(','):
            photos += [photo]
            photos_url += ['http://127.0.0.1:8001/starbucks/{}.jpg'.format(photo)]
    if features[index].get('properties').get('photos'):
        features[index]['properties']['photos'] = photos
def addEntree(request): if request.method != 'POST': return _error_response(request, "Must make POST request") if 'vendor_id' not in request.POST or \ 'entree_name' not in request.POST or \ 'entree_description' not in request.POST or \ 'entree_price' not in request.POST: return _error_response(request, "Missing required fields") vendor_id = request.POST['vendor_id'] entree_name = request.POST['entree_name'] entree_description = request.POST['entree_description'] entree_price = request.POST['entree_price'] # Create a new entree entree_post_data = {'name': entree_name, 'description': entree_description, 'price': entree_price} entree_post_encoded = urllib.parse.urlencode(entree_post_data).encode('utf-8') url = 'http://' + settings.MODEL_API + ':8000/api/v1/entree/create' req = urllib.request.Request(url, data=entree_post_encoded, method='POST') resp_json = urllib.request.urlopen(req).read().decode('utf-8') resp = json.loads(resp_json) if not resp or not resp['ok']: return _error_response(request, "error in database") # Get new entree's id and add to list entree_id = resp['resp']['entree_id'] entree_list = [] entree_list.append(entree_id) # Add to kafka queue kafka = KafkaClient('kafka:9092') producer = SimpleProducer(kafka) new_entree_listing = {'id': entree_id, 'entree_id': entree_id, 'entree_name': entree_name, 'entree_description': entree_description, 'vendor_id': vendor_id} #some_new_listing = {'title': '2nd Test"', 'description': 'Testing to make sure it works', 'id':45} #to avoid LeaderNotAvailable,try twice, then return error try: producer.send_messages('new-entrees-topic', json.dumps(new_entree_listing).encode('utf-8')) except: try: kafka.ensure_topic_exists('new-entrees-topic') producer.send_messages('new-entrees-topic', json.dumps(new_entree_listing).encode('utf-8')) except: return _error_response(request, "Kafka topic not accessible") #es = Elasticsearch(['es']) #new_entree_listing = {'entree_id': entree_id, 'entree_name': entree_name, 'entree_description': entree_description, 'vendor_id': vendor_id} #es.index(index='listing_index', doc_type='listing', id=new_entree_listing['entree_id'], body = new_entree_listing) #es.indices.refresh(index = 'listing_index') # Lookup menu by vendor_id menu_url = 'http://' + settings.MODEL_API + ':8000/api/v1/menu/' + vendor_id + '/lookup_menu_vendor' menu_req = urllib.request.Request(menu_url) resp_json = urllib.request.urlopen(menu_req).read().decode('utf-8') resp = json.loads(resp_json) if not resp or not resp['ok']: return _error_response(request, "Could not lookup menu by vendor id") menu = resp['resp'] menu_id = resp['resp']['menu_id'] # Add new entree to vendor's menu add_menu_post_data = {'entree_id_list': entree_list} add_menu_post_encoded = urllib.parse.urlencode(add_menu_post_data).encode('utf-8') add_to_menu_url = 'http://' + settings.MODEL_API + ':8000/api/v1/menu/' + str(menu_id) + '/add_entrees' add_to_menu_req = urllib.request.Request(add_to_menu_url, data=add_menu_post_encoded, method='POST') resp_json = urllib.request.urlopen(add_to_menu_req).read().decode('utf-8') resp = json.loads(resp_json) if not resp or not resp['ok']: return _error_response(request, "Could not add entree to menu") return _success_response(request, resp)
from kafka import SimpleProducer, KafkaClient

kafka = KafkaClient("10.42.2.106:9092")
producer = SimpleProducer(kafka)
kafka.ensure_topic_exists('updates')


class Journaled(object):

    def send_update(self, name, value):
        print(name, value)
        producer.send_messages('updates', str((name, value)).encode('utf-8'))
        return (True, "Message")

    def __setattr__(self, name, value):
        result = self.send_update(name, value)
        if result[0] is True:
            super(Journaled, self).__setattr__(name, value)
        else:
            raise Exception(
                "{0} cannot be set to {1} due to the following error:\n{2}".format(
                    name, value, result[1]))


j = Journaled()
j.prop = "prop"
from kafka import SimpleProducer, KafkaClient
from random import randint

# Producer module sends out Kafka messages on port 9092
k = KafkaClient("localhost:9092")
producer = SimpleProducer(k)

# User specifies name for event log (e.g. name of module or activity)
title = str(raw_input("Name event log: "))
k.ensure_topic_exists(title)

# Unique user sends messages as needed.
uid = "Learner" + str(randint(0, 20000))

while True:
    event = raw_input("Add what to event log?: ('Q' to end.): ")
    if event == 'Q':
        break
    else:
        msg = event.encode('UTF-8', 'ignore')
        producer.send_messages(title, "%s: %s" % (uid, msg))

# Module closes connection on exit.
k.close()
# Excerpt from a distributed-crawler redis monitor; REDIS_HOST, REDIS_PORT,
# KAFKA_HOSTS and KAFKA_TOPIC_PREFIX are settings defined elsewhere in the project.
import json
import pickle
import time
import traceback

import redis
from kafka import KafkaClient, SimpleProducer


class RedisMonitor:

    def __init__(self):
        self.setup()

    def setup(self):
        '''
        Connection stuff here so we can mock it
        '''
        self.redis_conn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

        # set up kafka
        self.kafka_conn = KafkaClient(KAFKA_HOSTS)
        self.producer = SimpleProducer(self.kafka_conn)
        self.topic_prefix = KAFKA_TOPIC_PREFIX

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        while True:
            self._do_info()
            self._do_expire()
            self._do_stop()

            time.sleep(0.1)

    def _do_info(self):
        '''
        Processes info action requests
        '''
        for key in self.redis_conn.scan_iter(match="info:*:*"):
            # the master dict to return
            master = {}
            master['uuid'] = self.redis_conn.get(key)
            master['total_pending'] = 0
            master['server_time'] = int(time.time())

            # break down key
            elements = key.split(":")
            dict = {}
            dict['spiderid'] = elements[1]
            dict['appid'] = elements[2]

            if len(elements) == 4:
                dict['crawlid'] = elements[3]

            # we received the info message
            print "received info request"

            # generate the information requested
            if 'crawlid' in dict:
                print "got crawlid info"
                master = self._build_crawlid_info(master, dict)
            else:
                print "got appid info"
                master = self._build_appid_info(master, dict)

            self.redis_conn.delete(key)

            if self._send_to_kafka(master):
                print 'Sent info to kafka'
            else:
                print 'Failed to send info to kafka'

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka
        @param master: the final dict to send
        @log_extras: the extras to append to the log output
        @returns: True if successfully sent to kafka
        '''
        appid_topic = "{prefix}.outbound_{appid}".format(
            prefix=self.topic_prefix, appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
            prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(appid_topic)
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # dont want logger in outbound kafka message
            dump = json.dumps(master)
            self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)
            return True
        except Exception as ex:
            print traceback.format_exc()
            pass

        return False

    def _build_appid_info(self, master, dict):
        '''
        Builds the appid info object
        @param master: the master dict
        @param dict: the dict object received
        @return: the appid info object
        '''
        master['total_crawlids'] = 0
        master['total_pending'] = 0
        master['total_domains'] = 0
        master['crawlids'] = {}
        master['appid'] = dict['appid']

        match_string = '{sid}:queue'.format(sid=dict['spiderid'])

        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid']:
                    crawlid = item['crawlid']

                    # add new crawlid to master dict
                    if crawlid not in master['crawlids']:
                        master['crawlids'][crawlid] = {}
                        master['crawlids'][crawlid]['total'] = 0
                        master['crawlids'][crawlid]['high_priority'] = -9999
                        master['crawlids'][crawlid]['low_priority'] = 9999

                        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                            sid=dict['spiderid'],
                            aid=dict['appid'],
                            cid=crawlid)
                        if self.redis_conn.exists(timeout_key):
                            master['crawlids'][crawlid]['expires'] = self.redis_conn.get(timeout_key)

                        master['total_crawlids'] = master['total_crawlids'] + 1

                    if item['priority'] > master['crawlids'][crawlid]['high_priority']:
                        master['crawlids'][crawlid]['high_priority'] = item['priority']

                    if item['priority'] < master['crawlids'][crawlid]['low_priority']:
                        master['crawlids'][crawlid]['low_priority'] = item['priority']

                    master['crawlids'][crawlid]['total'] = master['crawlids'][crawlid]['total'] + 1
                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _get_bin(self, key):
        '''
        Returns a binned dictionary based on redis zscore
        @return: The sorted dict
        '''
        # keys based on score
        sortedDict = {}
        # this doesnt return them in order, need to bin first
        for item in self.redis_conn.zscan_iter(key):
            my_item = pickle.loads(item[0])
            # score is negated in redis
            my_score = -item[1]
            if my_score not in sortedDict:
                sortedDict[my_score] = []
            sortedDict[my_score].append(my_item)

        return sortedDict

    def _build_crawlid_info(self, master, dict):
        '''
        Builds the crawlid info object
        @param master: the master dict
        @param dict: the dict object received
        @return: the crawlid info object
        '''
        master['total_pending'] = 0
        master['appid'] = dict['appid']
        master['crawlid'] = dict['crawlid']

        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
            sid=dict['spiderid'],
            aid=dict['appid'],
            cid=dict['crawlid'])
        if self.redis_conn.exists(timeout_key):
            master['expires'] = self.redis_conn.get(timeout_key)

        # get all domain queues
        match_string = '{sid}:queue'.format(sid=dict['spiderid'])
        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid'] and \
                        item['crawlid'] == dict['crawlid']:
                    if 'high_priority' not in master:
                        master['high_priority'] = -99999
                    if 'low_priority' not in master:
                        master['low_priority'] = 99999

                    if item['priority'] > master['high_priority']:
                        master['high_priority'] = item['priority']
                    if item['priority'] < master['low_priority']:
                        master['low_priority'] = item['priority']

                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _do_expire(self):
        '''
        Processes expire requests
        Very similar to _do_stop()
        '''
        for key in self.redis_conn.scan_iter(match="timeout:*:*:*"):
            timeout = float(self.redis_conn.get(key))
            curr_time = time.time()
            if curr_time > timeout:
                # break down key
                elements = key.split(":")
                spiderid = elements[1]
                appid = elements[2]
                crawlid = elements[3]

                # add crawl to blacklist so it doesnt propagate
                redis_key = spiderid + ":blacklist"
                value = '{appid}||{crawlid}'.format(appid=appid,
                                                    crawlid=crawlid)
                # add this to the blacklist set
                self.redis_conn.sadd(redis_key, value)

                # everything stored in the queue is now expired
                result = self._purge_crawl(spiderid, appid, crawlid)

                # item to send to kafka
                extras = {}
                extras['action'] = "expire"
                extras['spiderid'] = spiderid
                extras['appid'] = appid
                extras['crawlid'] = crawlid
                extras['total_expired'] = result

                self.redis_conn.delete(key)

                if self._send_to_kafka(extras):
                    print 'Sent expired ack to kafka'
                else:
                    print 'Failed to send expired ack to kafka'

    def _do_stop(self):
        '''
        Processes stop action requests
        '''
        for key in self.redis_conn.scan_iter(match="stop:*:*:*"):
            # break down key
            elements = key.split(":")
            spiderid = elements[1]
            appid = elements[2]
            crawlid = elements[3]
            uuid = self.redis_conn.get(key)

            # log we received the stop message
            print 'Received stop request'

            redis_key = spiderid + ":blacklist"
            value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)

            # add this to the blacklist set
            self.redis_conn.sadd(redis_key, value)

            # purge crawlid from current set
            result = self._purge_crawl(spiderid, appid, crawlid)

            # item to send to kafka
            extras = {}
            extras['action'] = "stop"
            extras['spiderid'] = spiderid
            extras['appid'] = appid
            extras['crawlid'] = crawlid
            extras['total_purged'] = result

            self.redis_conn.delete(key)

            if self._send_to_kafka(extras):
                # delete timeout for crawl (if needed) since stopped
                timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(sid=spiderid,
                                                                 aid=appid,
                                                                 cid=crawlid)
                self.redis_conn.delete(timeout_key)
                print 'Sent stop ack to kafka'
            else:
                print 'Failed to send stop ack to kafka'

    def _purge_crawl(self, spiderid, appid, crawlid):
        '''
        Wrapper for purging the crawlid from the queues
        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        # purge three times to try to make sure everything is cleaned
        total = self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)

        return total

    def _mini_purge(self, spiderid, appid, crawlid):
        '''
        Actually purges the crawlid from the queue
        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        total_purged = 0

        match_string = '{sid}:queue'.format(sid=spiderid)
        # using scan for speed vs keys
        for item in self.redis_conn.zscan_iter(match_string):
            item_key = item[0]
            item = pickle.loads(item_key)
            if 'meta' in item:
                item = item['meta']

            if item['appid'] == appid and item['crawlid'] == crawlid:
                self.redis_conn.zrem(match_string, item_key)
                total_purged = total_purged + 1

        return total_purged
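The _send_to_kafka method above ensures both topics exist and then fans the same JSON payload out to a per-app topic and a firehose topic. An illustrative sketch of that fan-out with the modern KafkaProducer follows; it is not part of the original class, the broker address is a placeholder, and topic creation is left to broker auto-create or an admin client.

import json
from kafka import KafkaProducer

# sketch: JSON serializer configured once, placeholder broker address
producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

def send_to_kafka(master, topic_prefix='demo'):
    appid_topic = "{prefix}.outbound_{appid}".format(prefix=topic_prefix,
                                                     appid=master['appid'])
    firehose_topic = "{prefix}.outbound_firehose".format(prefix=topic_prefix)
    # same payload fanned out to the app-specific topic and the firehose
    producer.send(appid_topic, master)
    producer.send(firehose_topic, master)
    producer.flush()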
import json
import os
import sys
from time import sleep

from flask import Flask, request
from kafka import KafkaClient, SimpleProducer

TOPIC_NAME = "sensor_temp"

try:
    KAFKA_IP = os.environ['KAFKA_PORT_9092_TCP_ADDR']
    KAFKA_PORT = os.environ['KAFKA_PORT_9092_TCP_PORT']
except KeyError:
    print "Please set the environment variables for KAFKA"
    sys.exit(1)

application = Flask(__name__)

sleep(10)  # hack to wait for kafka to be up in docker deployment

kafka = KafkaClient("{0}:{1}".format(KAFKA_IP, KAFKA_PORT))
producer = SimpleProducer(kafka)


@application.route('/')
def home():
    return "<h1>Hello World</h1>"


@application.route('/temperature', methods=['POST'])
def temperature():
    producer.send_messages(TOPIC_NAME, json.dumps(request.json))
    return "ok"


kafka.ensure_topic_exists(TOPIC_NAME)

if __name__ == "__main__":
    application.run(host="0.0.0.0", debug=True)
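The sleep(10) above is a crude way to wait for the broker inside a Docker deployment. A sketch of a readiness loop that retries the connection instead of sleeping a fixed time is shown below; the retry count and delay are arbitrary choices, and NoBrokersAvailable is what kafka-python raises while the broker is still coming up.

import time
from kafka import KafkaProducer
from kafka.errors import NoBrokersAvailable

def wait_for_kafka(bootstrap, attempts=30, delay=2):
    # keep retrying the bootstrap connection until the broker answers
    for _ in range(attempts):
        try:
            return KafkaProducer(bootstrap_servers=bootstrap)
        except NoBrokersAvailable:
            time.sleep(delay)
    raise RuntimeError("Kafka broker at {0} never became available".format(bootstrap))

producer = wait_for_kafka("{0}:{1}".format(KAFKA_IP, KAFKA_PORT))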
import json

from kafka import KafkaClient, KafkaProducer

KAFKA_CLUSTER_SERVERS = [
    '10.10.25.50:19092',
    '10.10.25.51:19092',
    '10.10.25.52:19092',
    '10.10.25.53:19092',
    '10.10.25.54:19092',
]

topic = "topic.itom.metric.mobile"

# client
client = KafkaClient(KAFKA_CLUSTER_SERVERS)
client.ensure_topic_exists(topic)

# producer
producer = KafkaProducer(bootstrap_servers=KAFKA_CLUSTER_SERVERS)


def f(num):
    taskid = "taskid{num}".format(num=num)
    version = "version{num}".format(num=num)

    # send message to kafka (KafkaProducer expects bytes unless a
    # value_serializer is configured)
    producer.send(topic, json.dumps(
        {
            "https": [
                {"taskId": taskid, "appVersion": version, "errorId": 500, "responseTime": 1},
                {"taskId": taskid, "appVersion": version, "errorId": 5, "responseTime": 1}
            ]
        }
    ).encode('utf-8'))
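Rather than calling json.dumps().encode() at every call site, a value_serializer configured once on the producer lets f() pass plain dicts. A minimal sketch, reusing the cluster list and topic from the snippet above:

import json
from kafka import KafkaProducer

# sketch: serializer handles the dict-to-bytes conversion for every send()
producer = KafkaProducer(
    bootstrap_servers=KAFKA_CLUSTER_SERVERS,
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

producer.send(topic, {"https": [{"taskId": "taskid1", "appVersion": "version1",
                                 "errorId": 500, "responseTime": 1}]})
producer.flush()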
import os
from kafka import KafkaClient, SimpleProducer, KafkaConsumer
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()

server = os.getenv('KAFKA_PORT_9092_TCP', 'tcp://localhost:9092')[6:]
print server

cl = KafkaClient(server)
pr = SimpleProducer(cl)
cn = KafkaConsumer('test', bootstrap_servers=[server], group_id='test')

cl.ensure_topic_exists('test')

for i in xrange(0, 100):
    pr.send_messages('test', str(i))
    print 'wrote', i

print 'starting consumer'
for message in cn:
    print "%s:%d:%d: key=%s value=%s" % (
        message.topic, message.partition,
        message.offset, message.key, message.value)

# vim: set ts=4 sw=4 expandtab:
# Excerpt from an epidata streaming module; SensorMeasurement, AutomatedTest and
# ConvertUtils are project-local helpers, and KafkaUtils comes from
# pyspark.streaming.kafka (Spark < 3.0).
from kafka import KafkaClient, KafkaProducer


class EpidataStreamingContext:

    def __init__(self, sc=None, ssc=None, sql_ctx=None, topics=None,
                 brokers=None, cassandra_conf=None, measurement_class=None):
        self._sc = sc
        self._sql_ctx = sql_ctx
        self._topics = topics
        self._ssc = ssc
        self._brokers = brokers
        self._cassandra_conf = cassandra_conf
        self._measurement_class = measurement_class

        # set up schemas
        self._sensor_measurement_schema = SensorMeasurement.get_schema()
        self._sensor_measurement_stats_schema = SensorMeasurement.get_stats_schema()
        self._automated_test_schema = AutomatedTest.get_schema()
        self._automated_test_stats_schema = AutomatedTest.get_stats_schema()

        self._kafka_producer = KafkaProducer(bootstrap_servers=self._brokers)
        self._client = KafkaClient(self._brokers)

    def run_stream(self, ops, clean_up=True):
        self._client.ensure_topic_exists(self._topics)

        kvs = KafkaUtils.createDirectStream(
            self._ssc, [self._topics], {"metadata.broker.list": self._brokers})

        if self._measurement_class == "sensor_measurement":
            rows = kvs.map(SensorMeasurement.to_row)
        elif self._measurement_class == "automated_test":
            rows = kvs.map(AutomatedTest.to_row)

        def process(time, rdd):
            if not rdd.isEmpty():
                rdd_df = self._sql_ctx.createDataFrame(rdd)

                # convert to a pandas dataframe
                panda_df = ConvertUtils.convert_to_pandas_dataframe_model(
                    rdd_df, clean_up)

                # perform all transformations and save the results
                for op in ops:
                    # apply transformation
                    output_df = op.apply(panda_df, self._sql_ctx)

                    if not output_df.empty:
                        if op.datastore() == "cassandra":
                            # clean up unnecessary columns
                            output_df = ConvertUtils.convert_meas_value(
                                output_df, op.destination())

                            # convert back to a spark dataframe
                            spark_output_df = self._sql_ctx.createDataFrame(
                                output_df, self._get_schema(op.destination()))

                            # convert to the db model used by cassandra
                            output_df_db = self._convert_to_db_model(
                                spark_output_df, op.destination())

                            # save to cassandra
                            output_df_db.write.format(
                                "org.apache.spark.sql.cassandra"
                            ).mode('append').options(
                                table=op.destination(),
                                keyspace=self._cassandra_conf['keyspace'],
                                user=self._cassandra_conf['user'],
                                password=self._cassandra_conf['password']
                            ).save()

                        elif op.datastore() == "kafka":
                            output_df_kafka = output_df
                            for i in output_df_kafka.index:
                                row_json = output_df_kafka.loc[i].to_json()
                                # push to kafka
                                self._kafka_producer.send(op.destination(), row_json)

                            # flush the kafka producer
                            self._kafka_producer.flush()

        rows.foreachRDD(process)

    def _start(self):
        self._ssc.start()
        self._ssc.awaitTermination()

    def _get_schema(self, destination):
        if destination == "measurements_summary":
            if self._measurement_class == "sensor_measurement":
                return self._sensor_measurement_stats_schema
            elif self._measurement_class == "automated_test":
                return self._automated_test_stats_schema
        else:
            if self._measurement_class == "sensor_measurement":
                return self._sensor_measurement_schema
            elif self._measurement_class == "automated_test":
                return self._automated_test_schema

    def _convert_to_db_model(self, input_df, dest):
        if self._measurement_class == "sensor_measurement":
            return SensorMeasurement.convert_to_db_model(input_df, dest)
        elif self._measurement_class == "automated_test":
            return AutomatedTest.convert_to_db_model(input_df, dest)
""" Usage: k-debug.py <host> """ from kafka import SimpleProducer, KafkaClient import logging import sys logging.basicConfig() kafka = KafkaClient(sys.argv[1] + ':9092') #producer = SimpleProducer(kafka) kafka.ensure_topic_exists(b'picasso-stackato-logs') print("Client: {0!r}".format(kafka)) md = kafka.send_metadata_request() print(" {0!r}".format(md)) for t in kafka.topics: print("{0!r}:".format(t)) print(" partitions: {0!r}:".format(kafka.get_partition_ids_for_topic(t))) #kafka.ensure_topic_exists(b'my-topic') print("done.") ## end
class RedisMonitor:

    def __init__(self):
        self.setup()

    def setup(self):
        '''
        Connection stuff here so we can mock it
        '''
        self.redis_conn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

        # set up kafka
        self.kafka_conn = KafkaClient(KAFKA_HOSTS)
        self.producer = SimpleProducer(self.kafka_conn)
        self.topic_prefix = KAFKA_TOPIC_PREFIX

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        while True:
            self._do_info()
            self._do_expire()
            self._do_stop()

            time.sleep(0.1)

    def _do_info(self):
        '''
        Processes info action requests
        '''
        for key in self.redis_conn.scan_iter(match="info:*:*"):
            # the master dict to return
            master = {}
            master['uuid'] = self.redis_conn.get(key)
            master['total_pending'] = 0
            master['server_time'] = int(time.time())

            # break down key
            elements = key.split(":")
            dict = {}
            dict['spiderid'] = elements[1]
            dict['appid'] = elements[2]

            if len(elements) == 4:
                dict['crawlid'] = elements[3]

            # generate the information requested
            if 'crawlid' in dict:
                master = self._build_crawlid_info(master, dict)
            else:
                master = self._build_appid_info(master, dict)

            self.redis_conn.delete(key)

            if self._send_to_kafka(master):
                pass  #print 'Sent info to kafka'
            else:
                print 'Failed to send info to kafka'

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka
        @param master: the final dict to send
        @log_extras: the extras to append to the log output
        @returns: True if successfully sent to kafka
        '''
        appid_topic = "{prefix}.outbound_{appid}".format(
            prefix=self.topic_prefix, appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
            prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(appid_topic)
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # dont want logger in outbound kafka message
            dump = json.dumps(master)
            self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)
            return True
        except Exception as ex:
            print traceback.format_exc()
            pass

        return False

    def _build_appid_info(self, master, dict):
        '''
        Builds the appid info object
        @param master: the master dict
        @param dict: the dict object received
        @return: the appid info object
        '''
        master['total_crawlids'] = 0
        master['total_pending'] = 0
        master['total_domains'] = 0
        master['crawlids'] = {}
        master['appid'] = dict['appid']

        match_string = '{sid}:queue'.format(sid=dict['spiderid'])

        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid']:
                    crawlid = item['crawlid']

                    # add new crawlid to master dict
                    if crawlid not in master['crawlids']:
                        master['crawlids'][crawlid] = {}
                        master['crawlids'][crawlid]['total'] = 0
                        master['crawlids'][crawlid]['high_priority'] = -9999
                        master['crawlids'][crawlid]['low_priority'] = 9999

                        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                            sid=dict['spiderid'],
                            aid=dict['appid'],
                            cid=crawlid)
                        if self.redis_conn.exists(timeout_key):
                            master['crawlids'][crawlid]['expires'] = self.redis_conn.get(timeout_key)

                        master['total_crawlids'] = master['total_crawlids'] + 1

                    if item['priority'] > master['crawlids'][crawlid]['high_priority']:
                        master['crawlids'][crawlid]['high_priority'] = item['priority']

                    if item['priority'] < master['crawlids'][crawlid]['low_priority']:
                        master['crawlids'][crawlid]['low_priority'] = item['priority']

                    master['crawlids'][crawlid]['total'] = master['crawlids'][crawlid]['total'] + 1
                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _get_bin(self, key):
        '''
        Returns a binned dictionary based on redis zscore
        @return: The sorted dict
        '''
        # keys based on score
        sortedDict = {}
        # this doesnt return them in order, need to bin first
        for item in self.redis_conn.zscan_iter(key):
            my_item = pickle.loads(item[0])
            # score is negated in redis
            my_score = -item[1]
            if my_score not in sortedDict:
                sortedDict[my_score] = []
            sortedDict[my_score].append(my_item)

        return sortedDict

    def _build_crawlid_info(self, master, dict):
        '''
        Builds the crawlid info object
        @param master: the master dict
        @param dict: the dict object received
        @return: the crawlid info object
        '''
        master['total_pending'] = 0
        master['appid'] = dict['appid']
        master['crawlid'] = dict['crawlid']

        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
            sid=dict['spiderid'],
            aid=dict['appid'],
            cid=dict['crawlid'])
        if self.redis_conn.exists(timeout_key):
            master['expires'] = self.redis_conn.get(timeout_key)

        # get all domain queues
        match_string = '{sid}:queue'.format(sid=dict['spiderid'])
        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid'] and \
                        item['crawlid'] == dict['crawlid']:
                    if 'high_priority' not in master:
                        master['high_priority'] = -99999
                    if 'low_priority' not in master:
                        master['low_priority'] = 99999

                    if item['priority'] > master['high_priority']:
                        master['high_priority'] = item['priority']
                    if item['priority'] < master['low_priority']:
                        master['low_priority'] = item['priority']

                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _do_expire(self):
        '''
        Processes expire requests
        Very similar to _do_stop()
        '''
        for key in self.redis_conn.scan_iter(match="timeout:*:*:*"):
            timeout = float(self.redis_conn.get(key))
            curr_time = time.time()
            if curr_time > timeout:
                # break down key
                elements = key.split(":")
                spiderid = elements[1]
                appid = elements[2]
                crawlid = elements[3]

                # add crawl to blacklist so it doesnt propagate
                redis_key = spiderid + ":blacklist"
                value = '{appid}||{crawlid}'.format(appid=appid,
                                                    crawlid=crawlid)
                # add this to the blacklist set
                self.redis_conn.sadd(redis_key, value)

                # everything stored in the queue is now expired
                result = self._purge_crawl(spiderid, appid, crawlid)

                # item to send to kafka
                extras = {}
                extras['action'] = "expire"
                extras['spiderid'] = spiderid
                extras['appid'] = appid
                extras['crawlid'] = crawlid
                extras['total_expired'] = result

                self.redis_conn.delete(key)

                if self._send_to_kafka(extras):
                    #print 'Sent expired ack to kafka'
                    pass
                else:
                    print 'Failed to send expired ack to kafka'

    def _do_stop(self):
        '''
        Processes stop action requests
        '''
        for key in self.redis_conn.scan_iter(match="stop:*:*:*"):
            # break down key
            elements = key.split(":")
            spiderid = elements[1]
            appid = elements[2]
            crawlid = elements[3]
            uuid = self.redis_conn.get(key)

            redis_key = spiderid + ":blacklist"
            value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)

            # add this to the blacklist set
            self.redis_conn.sadd(redis_key, value)

            # purge crawlid from current set
            result = self._purge_crawl(spiderid, appid, crawlid)

            # item to send to kafka
            extras = {}
            extras['action'] = "stop"
            extras['spiderid'] = spiderid
            extras['appid'] = appid
            extras['crawlid'] = crawlid
            extras['total_purged'] = result

            self.redis_conn.delete(key)

            if self._send_to_kafka(extras):
                # delete timeout for crawl (if needed) since stopped
                timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                    sid=spiderid,
                    aid=appid,
                    cid=crawlid)
                self.redis_conn.delete(timeout_key)
                #print 'Sent stop ack to kafka'
            else:
                print 'Failed to send stop ack to kafka'

    def _purge_crawl(self, spiderid, appid, crawlid):
        '''
        Wrapper for purging the crawlid from the queues
        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        # purge three times to try to make sure everything is cleaned
        total = self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)

        return total

    def _mini_purge(self, spiderid, appid, crawlid):
        '''
        Actually purges the crawlid from the queue
        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        total_purged = 0

        match_string = '{sid}:queue'.format(sid=spiderid)
        # using scan for speed vs keys
        for item in self.redis_conn.zscan_iter(match_string):
            item_key = item[0]
            item = pickle.loads(item_key)
            if 'meta' in item:
                item = item['meta']

            if item['appid'] == appid and item['crawlid'] == crawlid:
                self.redis_conn.zrem(match_string, item_key)
                total_purged = total_purged + 1

        return total_purged
import redis
from kafka import KafkaConsumer, KafkaClient
import flask
from message_pb2 import Message
from Market.commodities_pb2 import Commodity

conn = redis.StrictRedis(host='localhost', port=6379)


def get_type_of_message(message):
    return message.ListFields()[0][0].name


client = KafkaClient(hosts=['0.0.0.0:9092'])
client.ensure_topic_exists('resource')

consumer = KafkaConsumer('resource', bootstrap_servers=['0.0.0.0:9092'])

for message in consumer:
    print(message)
    mess = Message()
    mess.ParseFromString(message.value)
    if get_type_of_message(mess) == "trade":
        user_a, user_b = mess.trade.user_a_id, mess.trade.user_b_id
        amount = mess.trade.amount
        # the following line is redacted in the original source
        conn.incr("user:"******"user:" + str(user_b), int(amount))
class TestRedisMonitor(TestCase):

    maxDiff = None
    queue_key = "link:istresearch.com:queue"

    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load("localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])
        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(self.redis_monitor.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.outbound_firehose")

    def test_process_item(self):
        # we only want to go to the end now, not after this test is ran
        self.consumer.seek(0, 2)

        # set the info flag
        key = "info-test:blah"
        value = "ABC123"
        self.redis_monitor.redis_conn.set(key, value)

        # process the request
        plugin = self.redis_monitor.plugins_dict.items()[0][1]
        self.redis_monitor._process_plugin(plugin)

        # ensure the key is gone
        self.assertEquals(self.redis_monitor.redis_conn.get(key), None)

    def test_sent_to_kafka(self):
        success = {u'info-test': "ABC123", u"appid": u"someapp"}

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                self.assertEquals(success, the_dict)
                message_count += 1

        self.assertEquals(message_count, 1)
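SimpleConsumer and get_messages() belong to the same legacy kafka-python API as ensure_topic_exists. A sketch of the firehose check with the modern KafkaConsumer follows; it is not part of the original test. The topic name comes from the test above, "localhost:9092" stands in for the KAFKA_HOSTS setting, and consumer_timeout_ms makes the iteration stop instead of get_messages() returning None.

import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "demo_test.outbound_firehose",
    bootstrap_servers="localhost:9092",   # placeholder for KAFKA_HOSTS
    auto_offset_reset="latest",
    consumer_timeout_ms=5000)             # stop iterating after 5 s of silence

message_count = 0
for record in consumer:
    the_dict = json.loads(record.value)
    if the_dict.get("appid") == "someapp":
        message_count += 1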
def ensure_topic(self, topic):
    # note: ensure_topic_exists is only available on the legacy kafka-python
    # client, which takes a hosts argument rather than bootstrap_servers
    from kafka import KafkaClient
    client = KafkaClient(hosts=self.connection_string)
    client.ensure_topic_exists(topic)
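With current kafka-python releases, explicit topic creation goes through the admin client instead. A sketch of the same helper on top of kafka.admin is below; the partition and replication counts are placeholders, and depending on the kafka-python version the "already exists" case may surface as an exception or only in the returned response.

from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import TopicAlreadyExistsError

def ensure_topic(connection_string, topic):
    admin = KafkaAdminClient(bootstrap_servers=connection_string)
    try:
        # num_partitions / replication_factor are placeholder values
        admin.create_topics([NewTopic(name=topic, num_partitions=1,
                                      replication_factor=1)])
    except TopicAlreadyExistsError:
        pass  # topic is already there, which is what we wanted
    finally:
        admin.close()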
class TestLinkSpider(TestCase):

    example_feed = "\x80\x02}q\x00(X\x0f\x00\x00\x00allowed_domainsq\x01NX"\
        "\x0b\x00\x00\x00allow_regexq\x02NX\a\x00\x00\x00crawlidq\x03X\x19"\
        "\x00\x00\x0001234567890abcdefghijklmnq\x04X\x03\x00\x00\x00urlq\x05X"\
        "\x13\x00\x00\x00www.istresearch.comq\x06X\a\x00\x00\x00expiresq\aK"\
        "\x00X\b\x00\x00\x00priorityq\bK\x01X\n\x00\x00\x00deny_regexq\tNX\b"\
        "\x00\x00\x00spideridq\nX\x0b\x00\x00\x00test-spiderq\x0bX\x05\x00"\
        "\x00\x00attrsq\x0cNX\x05\x00\x00\x00appidq\rX\a\x00\x00\x00testappq"\
        "\x0eX\x06\x00\x00\x00cookieq\x0fNX\t\x00\x00\x00useragentq\x10NX\x0f"\
        "\x00\x00\x00deny_extensionsq\x11NX\b\x00\x00\x00maxdepthq\x12K\x00u."

    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume the potential result
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.crawled_firehose",
            buffer_size=1024 * 100,
            fetch_size_bytes=1024 * 100,
            max_buffer_size=None)
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())

        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()

        reactor.run()

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                if the_dict is not None and the_dict['appid'] == 'testapp' \
                        and the_dict['crawlid'] == '01234567890abcdefghijklmn':
                    message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)
""" Usage k-swarm.py <hostname> """ from kafka import SimpleProducer, KafkaClient import logging import sys logging.basicConfig() kafka = KafkaClient(sys.argv[1] + ':9092') kafka.ensure_topic_exists(b'my-topic') producer = SimpleProducer(kafka) for ii in range(100000): msg = "msg-{}".format(ii) producer.send_messages(b'my-topic', msg) print("done.") ## end