import os
import time

from kafka import KafkaClient, SimpleConsumer  # legacy kafka-python API


class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000,
                                       auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open a local staging file and write the CSV header
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # roll the local file once it exceeds ~20KB
                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                # consumer is referencing an unknown offset; jump to the tail
                self.consumer.seek(0, 2)
                self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages onto hdfs, then start a new local file
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
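# Hypothetical driver sketch for the Consumer class above (not part of the
# original source); the broker address, group, and topic names are placeholders.
if __name__ == '__main__':
    consumer = Consumer('localhost:9092', 'datamill-group', 'datamill')
    consumer.consume_topic()   # blocks forever, rolling ~20KB blocks into HDFS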
def main(): client = KafkaClient("localhost:9092") consumer = SimpleConsumer(client, "test-group", "twitter_raw") consumer.seek(0,2) num = 0 for message in consumer: print "redis publish:", num num+=1 try: data_depickled = pickle.loads(message.message.value.decode('utf-8')) except Exception, e: continue # print data_depickled # { # 'text':'@_LulaMoore me hamas perra', # 'created_at':datetime.datetime(2015, 10, 9, 23, 36, 49), # 'source':u'Twitter Web Client', # 'lang:':u'es', # 'place':{ # 'country_code':u'AR', # 'coordinates':[ # [ # -68.176283, # -38.984724 # ], # [ # -68.176283, # -38.921051 # ], # [ # -68.015162, # -38.921051 # ], # [ # -68.015162, # -38.984724 # ] # ] # }, # 'user':{ # 'statuses_count':15067, # 'name':u'Dama negra *\uffe6*', # 'friends_count':390, # 'created_at':datetime.datetime(2014, 3, 15,2,37, 10), # 'profile_image_url': u'http://pbs.twimg.com/profile_images/652333268256313344/x9K9Nlys_normal.jpg', # 'followers_count':384, # 'id':2390242428 # }, # 'id':652628813935980544 # } ### process data here ### # text = data_depickled['text'] filtered_data = data_filter(data_depickled) data_pickled = pickle.dumps(filtered_data) redis.publish('tweets_processed', data_pickled)
def main(): client = KafkaClient("localhost:9092") consumer = SimpleConsumer(client, "test-group", "twitter_raw") consumer.seek(0,2) for message in consumer: # data_deserialized = str.decode(message.message.value) data_depickled = pickle.loads(message.message.value.decode('utf-8')) # print str(data_depickled).decode('string_escape') print data_depickled
def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
    print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
    consumer = SimpleConsumer(self.client, topic_group, topic_name)
    consumer.seek(0, 2)

    for message in consumer:
        message = parse_json(message)
        print "=============" + str(message) + "============"
        message_consume_function(message)
        print "called message consume function"
def run(self): client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092") consumer = SimpleConsumer(client, "test-group", "jiketest",auto_commit=False,partitions=self.part) consumer.seek(0,0) while True: message = consumer.get_message(True,60) self.__offset = message.offset print message.message.value
class Consumer(object): def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/insight/artsy/geo" self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self, output_dir): """Consumes a stream of messages from the "post_geo_activity" topic. Code template from https://github.com/ajmssc/bitcoin-inspector.git """ timestamp = time.strftime('%Y%m%d%H%M%S') # open file for writing self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp) self.temp_file = open(self.temp_file_path,"w") while True: try: # get 1000 messages at a time, non blocking messages = self.consumer.get_messages(count=1000, block=False) for message in messages: self.temp_file.write(message.message.value + "\n") # file size > 20MB if self.temp_file.tell() > 20000000: self.flush_to_hdfs(output_dir) self.consumer.commit() except: # move to tail of kafka topic if consumer is referencing # unknown offset self.consumer.seek(0, 2) def flush_to_hdfs(self, output_dir): """Flushes the 20MB file into HDFS.""" self.temp_file.close() timestamp = time.strftime('%Y%m%d%H%M%S') hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,self.topic, timestamp) print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt),hadoop_fullpath) self.block_cnt += 1 os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # save from local to hdfs os.remove(self.temp_file_path) # remove temp local file timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp) self.temp_file = open(self.temp_file_path, "w")
class KafkaDatawakeLookaheadSpout(Spout): group = 'datawake-crawler-out-consumer'.encode() def __init__(self): Spout.__init__(self) self.queue = None def initialize(self, stormconf, context): try: settings = all_settings.get_settings(stormconf['topology.deployment']) self.topic = settings['crawler-out-topic'].encode() self.conn_pool = settings['conn_pool'].encode() self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool) self.kafka = KafkaClient(self.conn_pool) self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None) self.consumer.seek(0, 2) # move to the tail of the queue except: self.log("KafkaDatawakeLookaheadSpout initialize error", level='error') self.log(traceback.format_exc(), level='error') raise def next_tuple(self): """ input message: dict( id = input['id'], appid = input['appid'], url = url, status_code = response.getcode(), status_msg = 'Success', timestamp = response.info()['date'], links_found = links, raw_html = html, attrs = input['attrs'] ) :return: (url, status, headers, flags, body, timestamp, source,context) """ offsetAndMessage = self.consumer.get_messages(timeout=None)[0] message = offsetAndMessage.message.value crawled = json.loads(message) safeurl = crawled['url'].encode('utf-8', 'ignore') self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl) context = { 'source': 'datawake-lookahead', 'userId': crawled['attrs']['userId'], 'org': crawled['attrs']['org'], 'domain': crawled['attrs']['domain'], 'url': crawled['url'] } self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'], crawled['timestamp'], context['source'], context])
def run(self): client = None consumer = None try: prev = None # print("Starting Kafka Client") # print("Kafka topic: {}").format(self.topic) print get_kafka_hosts() client = KafkaClient(hosts=get_kafka_hosts()) consumer = SimpleConsumer(client=client, group=self.groupName.encode( 'ascii', 'ignore'), topic=self.topic, iter_timeout=5) consumer.seek(0, 1) print '[Kafka Consumer] START' print 'Topic: {}'.format(self.topic) print 'Listening incoming message...' print '=========================================================' # print("Listening kafka message...") while self.stopCpu is False: for message in consumer.get_messages(count=5, block=False): if self.stopCpu is True: # print("Kafka Consumer Listening Stopped") break if message: offset = message.offset value = message.message.value print 'msg: {0}, offset: {1}'.format(value, offset) if len(value) > 0: # chartdata = [] # j_val = json.loads(value) # j_val['offset'] = offset # chartdata.append(j_val) # print("destination => ws"+str(self.pid)) # self.parentOj.emit("ws"+str(self.type), chartdata) # self.parentOj.emit(self.topic, value) self.parentOj.emit("ws" + str(self.pid), value) print '[Kafka Consumer] STOP' print 'Topic: {}'.format(self.topic) print 'Stop listening...' print '========================================================' # print("Listening kafka Stopped") consumer.stop() client.close() except Exception as e: consumer.stop() client.close()
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic +
                     ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp, org, domain, user_id, url, html)
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                # offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                # the consumer yields OffsetAndMessage objects; split the raw value
                fields = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = fields
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
class KafkaConsumer:
    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                       max_buffer_size=None)
        self.consumer.seek(0, 2)  # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
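# Hypothetical usage sketch for the KafkaConsumer wrapper above (not part of
# the original source); broker address and topic name are placeholders.
if __name__ == '__main__':
    lookahead = KafkaConsumer('localhost:9092', 'datawake-crawler-out', KafkaConsumer.group)
    while True:
        raw = lookahead.next()   # blocks until the next message arrives
        print raw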
def consume():
    message_count = 0
    kafka = KafkaClient(KAFKA_HOSTS)
    consumer = SimpleConsumer(client=kafka,
                              group='XXXXX-YYYYY-ZZZZZ',
                              topic='events',
                              iter_timeout=15,
                              max_buffer_size=1024 * 1024 * 2)
    consumer.seek(0, 0)  # start from the beginning of the topic

    for m in consumer:
        message_count += 1
        pprint.pprint(m.message.value)

    print message_count
    print
class CrawlerSpout(Spout):
    group = 'datawake-crawler-in-consumer'.encode()

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic =' + self.topic +
                     ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None,
                                           fetch_size_bytes=2000000)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            json.dumps(dict(
                id = 'abcdefg',  # TODO generate UUID
                appid = self.appid,
                url = url,
                priority = 50,
                depth = 0,
                attrs = dict(
                    userId = context['userId'],
                    org = context['org'],
                    domain = context['domain']
                )
            ))
        :return:
        """
        try:
            for message in self.consumer:
                # the consumer yields OffsetAndMessage objects; decode the raw value
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(), level='error')
class Command(BaseCommand): option_list = BaseCommand.option_list + ( make_option("--beginning", action="store_true", dest="beginning", default=False, help="Start from the beginning of the topic (seek to 0)"), make_option("--end", action="store_true", dest="end", default=False, help="Start from the end of the topic (seek to 0)"), make_option( "--dev", action="store_true", dest="dev", default=False, help= "Start a few hundred back from the end of the topic to warm local events." ), ) topic = settings.KAFKA_EVENT_TOPIC @property def consumer_name(self): basename = str(__name__).split(".")[-1].replace("_", "-") return "%s-%s" % ( basename, socket.gethostname(), ) def handle(self, *args, **options): self.stdout.write("Starting %s worker for %s." % ( self.topic, self.consumer_name, )) self.consumer = SimpleConsumer(client=kafka_client, group=self.consumer_name, topic=self.topic, max_buffer_size=1024 * 1024 * 2) # self.consumer = MultiProcessConsumer(client=kafka_client, # group=self.consumer_name, # topic=self.topic, # num_procs=multiprocessing.cpu_count()) self.dev_mode = False if options["beginning"]: self.consumer.seek(0, 0) elif options["end"]: self.consumer.seek(0, 2) elif options["dev"]: self.dev_mode = True self.consumer.seek(-300, 2) try: while bool(self.consumer): try: for m in self.consumer: self.handle_message(m) except OffsetOutOfRangeError: self.stderr.write( "Offset out of range error. Seeking to beginning.") self.consumer.seek(0, 0) except KeyboardInterrupt: self.consumer = False kafka_client.close() self.stdout.write("Shutting down %s worker." % (self.topic, )) self.stdout.write("Got Ctrl-C, terminating %s consumer." % (self.topic, )) def handle_message(self, message_and_offset): offset = message_and_offset.offset message = message_and_offset.message payload = json.loads(message.value) team = payload.get("team") event_type = payload.get("event") if not payload.get("event_id"): self.stdout.write("Skipping event with no UUID <Offset: %s>" % offset) return # Process Advisories if event_type in ADVISORY_EVENTS: task = create_advisory_from_event_payload if self.dev_mode: advisory = task(payload) if advisory: print "Created %s" % advisory else: task.apply_async(args=(payload, )) # New Sensor Enrollment if event_type == "sensor_new": return Sensor.create_from_payload(payload) # Cluster Destroy Event if event_type == "dead_packages_group": cluster_id = payload["value"].values()[0] try: return Cluster.objects.get(uuid=cluster_id).delete() except Cluster.DoesNotExist: return
class ZKConsumer(object): zk_timeout = 30 jitter_seconds = 30 broker_prefix = '/brokers/ids' def __init__( self, zk_hosts, group, topic, nodes, zk_handler=None, logger=None, identifier=None, **consumer_kwargs): """Creates a Consumer that tracks state in ZooKeeper, rebalancing partition ownership as registered consumers change. NOTE: this class is intended for version 0.8.1 of Kafka, where offsets are managed by Kafka but there is no rebalancing in the protocol. """ if logger is None: logger = logging.getLogger('kafka.consumer.ZKConsumer') self.logger = logger self.identifier = identifier if KafkaClient is None: raise RuntimeError("Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']") self.zk_handler = zk_handler self.zk_hosts = zk_hosts self.broker_hosts = [] self.group = group self.topic = topic self.zk = None self.nodes = nodes self.client = None self.consumer = None self.consumer_kwargs = consumer_kwargs # This will kick off a cascading sequence to initialize ourselves: # 1. Connect to ZK and pull list of Kafka brokers # 2. Register ourselves as a consumer in ZK # 3. Rebalance partitions across all connected consumers self.init_zk() def zk_session_watch(self, state): self.logger.debug('ZK transitioned to: %s', state) if state == KazooState.SUSPENDED: if self.consumer is not None: self.logger.info('Stopping Kafka consumer') self.consumer.stop() self.consumer = None # Lost connection to ZK; we can't call any methods that would # try to contact it (i.e., we can't do self.zkp.finish() ) self.zkp = None elif state == KazooState.CONNECTED: self.logger.info('Restarting ZK partitioner') self.zk.handler.spawn(self.init_zkp) def _zkp_wait(self): handler = self.zk.handler while 1: if self.zkp.failed: self.logger.warning("Lost or unable to acquire partition") self.stop() elif self.zkp.release: self.zkp.release_set() elif self.zkp.acquired: def group_change_proxy(event): self.logger.warn('Connected consumers changed') if self.zkp is None: self.logger.info('Restarting ZK partitioner') handler.spawn(self.init_zkp) elif self.zkp is not None and self.zkp.failed: self.logger.warning("Lost or unable to acquire partition") self.stop() else: self.logger.info('Scheduling ZK partitioner set release') rel_greenlet = handler.spawn(self.zkp.release_set) self.logger.info('Scheduling group re-join') rel_greenlet.link_value(lambda greenlet: self.zkp.join_group) if not self.nodes: self.logger.info('Partitioner aquired; setting child watch') result = self.zk.get_children_async(self.zkp._group_path) result.rawlink(group_change_proxy) # Break out of while loop to begin consuming events break elif self.zkp.allocating: self.zkp.wait_for_acquire() def init_zkp(self): if not hasattr(self, 'zkp') or self.zkp is None: if self.nodes: self.zkp = StaticZKPartitioner( self.zk, self.group, self.topic, self.nodes, partitions_changed_cb=self.init_consumer, logger=self.logger, identifier=self.identifier) else: self.zkp = ZKPartitioner( self.zk, self.group, self.topic, time_boundary=self.jitter_seconds, partitions_changed_cb=self.init_consumer, logger=self.logger, identifier=self.identifier) self._zkp_wait() def init_zk(self): # TODO: switch to async # 1. 
implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler) self.zk.start() self.zk.add_listener(self.zk_session_watch) @self.zk.ChildrenWatch(self.broker_prefix) def broker_change_proxy(broker_ids): self.onBrokerChange(broker_ids) self.init_zkp() def onBrokerChange(self, broker_ids): self.broker_hosts = [] for b_id in broker_ids: b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id])) b_data = json.loads(b_json) self.broker_hosts.append('{}:{}'.format(b_data['host'], b_data['port'])) my_partitions = [] if self.consumer is not None: self.logger.warn('Brokers changed, stopping Kafka consumer.') my_partitions = self.consumer.offsets.keys() self.consumer.stop() self.consumer = None if self.client is not None: self.logger.warn('Brokers changed, stopping Kafka client.') self.client.close() self.client = None if my_partitions: msg = 'Brokers changed, queuing restart of Kafka client / consumer.' self.logger.warn(msg) self.zk.handler.spawn(self.init_consumer, my_partitions) def init_consumer(self, my_partitions): if self.consumer is None: self.logger.warn('Starting Kafka client') self.client = KafkaClient(self.broker_hosts, client_id=self.zkp._identifier) else: if self.consumer is None or \ sorted(my_partitions) != sorted(self.consumer.offsets.keys()): self.logger.warn('Partitions changed, restarting Kafka consumer.') self.consumer.stop() else: self.logger.info('Partitions unchanged, not restarting Kafka consumer.') return self.consumer = SimpleConsumer(self.client, self.group, self.topic, partitions=my_partitions, **self.consumer_kwargs) self.consumer.provide_partition_info() self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets) def stop(self): if self.consumer is not None: self.logger.info('Stopping Kafka consumer') self.consumer.stop() self.consumer = None if self.client is not None: self.logger.info('Stopping Kafka client') self.client.close() self.client = None if self.zk is not None: self.logger.info('Stopping ZooKeeper client') if self.zkp is not None and not self.zkp.failed: self.zkp.finish() self.zk.stop() self.zkp = None self.zk = None def commit(self, partitions=None): """ Commit offsets for this consumer partitions: list of partitions to commit, default is to commit all of them """ if self.consumer is None: return self.logger.debug('Begin committing offsets for partitions: %s', partitions if partitions else 'All') self.consumer.commit(partitions) self.logger.debug('End committing offsets for partitions: %s', partitions if partitions else 'All') def pending(self, partitions=None): """ Gets the pending message count partitions: list of partitions to check for, default is to check all """ return self.consumer.pending(partitions) def provide_partition_info(self): """ Indicates that partition info must be returned by the consumer """ self.consumer.provide_partition_info() def seek(self, offset, whence): """ Alter the current offset in the consumer, similar to fseek offset: how much to modify the offset whence: where to modify it from 0 is relative to the earliest available offset (head) 1 is relative to the current offset 2 is relative to the latest known offset (tail) """ self.consumer.seek(offset, whence) def get_messages(self, count=1, block=True, timeout=0.1): """ Fetch the specified number of messages count: Indicates the maximum number of messages to be fetched block: If True, the API will block till some messages are fetched. 
timeout: If block is True, the function will block for the specified time (in seconds) until count messages is fetched. If None, it will block forever. """ if self.consumer is None: return [] else: try: messages = self.consumer.get_messages(count, block, timeout) if not messages and self.zkp.failed: raise FailedPayloadsError return messages except FailedPayloadsError as err: msg = 'Failed to retrieve payload, restarting consumer' self.logger.exception(msg) raise err def get_message(self, block=True, timeout=0.1, get_partition_info=None): return self.consumer.get_message(block, timeout, get_partition_info) def _get_message(self, block=True, timeout=0.1, get_partition_info=None, update_offset=True): return self.consumer._get_message(block, timeout, get_partition_info, update_offset) def __iter__(self): for msg in self.consumer: yield msg
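# Hypothetical usage sketch for the ZKConsumer class above (not part of the
# original source); it assumes a reachable ZooKeeper ensemble and Kafka 0.8.1
# cluster, and the host, group, topic, and node values below are placeholders.
zk_consumer = ZKConsumer(zk_hosts='zk1:2181,zk2:2181',
                         group='my-group',
                         topic='my-topic',
                         nodes=[])           # empty list -> dynamic ZKPartitioner
for msg in zk_consumer.get_messages(count=10, block=True, timeout=1.0):
    print msg                                # (partition, OffsetAndMessage) tuples
zk_consumer.commit()
zk_consumer.stop()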
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "/home/ubuntu/FantasyFootball/ingestion/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        one_entry = False

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                #   attributes=0, key=None, value='some message'))
                for message in messages:
                    one_entry = True
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)
                if one_entry:
                    self.save_to_hdfs(output_dir)
                    self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/user/solivero/playerpoints/history/%s_%s_%s.dat" % (
            self.group, self.topic, timestamp)
        cached_path = "/user/solivero/playerpoints/cached/%s_%s_%s.dat" % (
            self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path, cached_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "/home/ubuntu/fantasyfootball/ingestion/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class KafkaDatawakeLookaheadSpout(Spout): group = 'datawake-crawler-out-consumer'.encode() def __init__(self): Spout.__init__(self) self.queue = None def initialize(self, stormconf, context): try: self.settings = all_settings.get_settings( stormconf['topology.deployment']) self.topic = self.settings['crawler-out-topic'].encode() self.conn_pool = self.settings['crawler_conn_pool'].encode() self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool) self.kafka = KafkaClient(self.conn_pool) self.kafka.ensure_topic_exists(self.topic) self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None) self.consumer.seek(0, 2) # move to the tail of the queue except: self.log("KafkaDatawakeLookaheadSpout initialize error", level='error') self.log(traceback.format_exc(), level='error') raise def next_tuple(self): """ input message: dict( crawlid = input['crawlid'], appid = input['appid'], url = url, status_code = response.getcode(), status_msg = 'Success', timestamp = response.info()['date'], links_found = links, body = html, attrs = input['attrs'] ) :return: (url, status, headers, flags, body, timestamp, source,context) """ offsetAndMessage = self.consumer.get_messages(timeout=None)[0] message = offsetAndMessage.message.value crawled = json.loads(message) if crawled['appid'] == self.settings["appid"]: safeurl = crawled['url'].encode('utf-8', 'ignore') self.log("Lookahead spout received id: " + crawled['crawlid'] + " url: " + safeurl) context = { 'source': 'datawake-lookahead', 'domain': crawled['attrs']['domain'] } self.emit([ crawled['url'], crawled['status_code'], '', '', crawled['body'], crawled['timestamp'], context['source'], context ])
class Consumer(object): def __init__(self, addr, group, topic): self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.topic = topic self.group = group self.block_cnt = 0 os.system ( "hdfs dfs -mkdir /data2" ) def consume_topic(self, output_dir): if not os.path.isdir ( output_dir ): os.makedirs ( output_dir ) timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path,"w") while True: try: # get 1000 messages at a time, non blocking messages = self.consumer.get_messages(count=1000, block=False) # OffsetAndMessage(offset=43, message=Message(magic=0, # attributes=0, key=None, value='some message')) for message in messages: self.temp_file.write(message.message.value + "\n") # file size > 40MB if self.temp_file.tell() > 40000000: self.flush_to_hdfs(output_dir) self.consumer.commit() except: # move to tail of kafka topic if consumer is referencing # unknown offset self.consumer.seek(0, 2) def flush_to_hdfs(self, output_dir): self.temp_file.close() timestamp = time.strftime('%Y%m%d%H%M%S') print "Block {}: Flushing 40MB file to HDFS => /data2".format(str(self.block_cnt)) self.block_cnt += 1 # place blocked messages into history and cached folders on hdfs os.system("hdfs dfs -copyFromLocal %s %s" % (self.temp_file_path, "/data2")) os.remove(self.temp_file_path) timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w")
class Consumer(object): def __init__(self, addr, group, topic): self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/user/AdReport/%s/history" %(topic) self.cached_path = "/user/AdReport/%s/cached" % (topic) self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self, output_dir): timestamp = time.strftime('%Y%m%d%H%M%S') #open file for writing self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path,"w") print ( self.temp_file) #one_entry = False while True: try: messages = self.consumer.get_messages(count=10, block=False) #OffsetAndMessage(offset=43, message=Message(magic=0, # attributes=0, key=None, value='some message')) for message in messages: print (message) #one_entry = True #print (self.temp_file.tell()) self.temp_file.write(message.message.value + "\n") if self.temp_file.tell() > 2000000: self.save_to_hdfs(output_dir) self.consumer.commit() except: self.consumer.seek(0, 2) #if one_entry: #print ("sending to hdfs") #self.save_to_hdfs(output_dir, self.topic) #self.consumer.commit() def save_to_hdfs(self, output_dir): print ("Saving file to hdfs") self.temp_file.close() print ("Closed open file") timestamp = time.strftime('%Y%m%d%H%M%S') hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp) cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group, self.topic, timestamp) #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath) self.block_cnt += 1 # place blocked messages into history and cached folders on hdfs os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path, cached_fullpath)) os.remove(self.temp_file_path) timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w")
def run(self):
    client = None
    consumer = None
    try:
        prev = None
        print("Starting Kafka Client")
        print("Kafka topic: {}".format(self.topic))

        hosts = get_es_hosts()
        print "es hosts: ", hosts
        es = Elasticsearch(hosts=hosts)
        # ES_HOST = {"host": "localhost", "port": 9200}
        # es = Elasticsearch(hosts=[ES_HOST])

        # host_list = get_all_host('KAFKA')
        hosts = get_kafka_hosts()
        print "kafka hosts: ", hosts
        client = KafkaClient(hosts=hosts)
        # print "kafka client group name: " + self.groupName
        consumer = SimpleConsumer(client, self.groupName, self.topic)
        # seek 5 messages back from the tail (offset=-5, whence=2)
        consumer.seek(-5, 2)
        print("Listening kafka message...")

        while self.stopCpu is False:
            # for message in consumer.get_messages(count=5, block=False):
            for message in consumer:
                if self.stopCpu is True:
                    print("Kafka Consumer Listening Stopped")
                    break

                if message:
                    print "Consuming kafka message: ", message
                    value = message.message.value
                    try:
                        json_value = json.loads(value)
                        offset = message.offset
                        json_value['data'][0]['offset'] = offset
                        value = json.dumps(json_value)
                        print "Publishing data: ", value

                        doc = json.dumps(json_value['data'][0])
                        if len(doc) > 0:
                            es.index(index='kafka', doc_type=self.topic,
                                     id=offset, body=doc)
                        if len(value) > 0:
                            self.parentOj.emit("ws" + str(self.pid), value)
                    except Exception as e:
                        traceback.print_exc()
                        print "Skipping invalid message"
    except Exception as e:
        traceback.print_exc()
    finally:
        print("Listening kafka Stopped")
        print "Stopping consumer ..."
        if consumer is not None:
            consumer.stop()
        print "Closing client ..."
        if client is not None:
            client.close()
def main(): """kafkadump: Kafka topic dump utility for debugging. Usage: kafkadump list --host=<host> kafkadump dump <topic> --host=<host> [--consumer=<consumer>] Examples: List all the topics on your local Kafka instance: python kafkadump.py list --host=<kafkahost>:9092 Dump the contents of a single topic starting from offset 0: python kafkadump.py dump test.crawled_firehose --host=<kafkahost>:9092 Use CTRL+C (SIGINT, KeyboardInterrupt) to stop it from polling Kafka. It will end by printing the total records serviced and the raw output of the most recent record. Options: -h --host <host> Kafka host name where Kafka cluster will be resolved -c --consumer <consumer> Consumer group ID to use for reading messages """ args = docopt(main.__doc__) host = args["--host"] logging.basicConfig() print "=> Connecting to {0}...".format(host) kafka = KafkaClient(host) print "=> Connected." if args["list"]: for topic in kafka.topic_partitions.keys(): print topic return 0 elif args["dump"]: topic = args["<topic>"] consumer_id = args["--consumer"] or "default" consumer = SimpleConsumer(kafka, consumer_id, topic, buffer_size=1024*100, # 100kb fetch_size_bytes=1024*100, # 100kb max_buffer_size=None # eliminate big message errors ) consumer.seek(0, 0) num_records = 0 total_bytes = 0 item = None while True: try: message = consumer.get_message() if message is None: time.sleep(1) continue val = message.message.value item = json.loads(val) body_bytes = len(item) print item num_records = num_records + 1 total_bytes = total_bytes + body_bytes except: traceback.print_exc() break total_mbs = float(total_bytes) / (1024*1024) print if item is not None: print json.dumps(item, indent=4) if num_records == 0: num_records = 1 print num_records, "records", total_mbs, "megabytes", (float(total_bytes) / num_records / 1024), "kb per msg" kafka.close() return 0
import json
import sys

from kafka import KafkaClient, SimpleConsumer  # legacy kafka-python API

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print 'Purpose: move a consumer group\'s offset to a given point in time'
        print 'Kafka server: xxxxxxx:9092 (equivalent to 15/25:9092)'
        print 'Usage: .py [topic] [group] [date]'
    else:
        topic = sys.argv[1]
        group = sys.argv[2]
        date = sys.argv[3]
        server = 'xxxxxxx:9092'
        print 'On %s, moving topic %s consumer group %s to time %s ...' % (server, topic, group, date)

        client = KafkaClient(server)
        consumer = SimpleConsumer(client, group, topic)

        # Walk the topic with a shrinking step until a message at the target
        # date is reached (seek whence: 0 = absolute, 1 = relative to current).
        step = 10000
        consumer.seek(step, 0)
        cnt = 0
        while step > 1:
            cnt = cnt + 1
            message = consumer.get_message()
            msg = json.loads(message.message.value)
            if msg.has_key('up_time'):
                if cnt % 2 == 0:
                    print 'Processed %s to date %s' % (cnt, msg['up_time'])
                if msg['up_time'] > date:
                    step = int(step * 2 / 3)
                    consumer.seek(-step, 1)
                elif msg['up_time'] == date:
                    break
                else:
                    consumer.seek(step, 1)
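# A short illustration of the SimpleConsumer.seek() semantics relied on above
# (whence 0 = from the earliest offset, 1 = relative to the current offset,
# 2 = from the latest offset); the consumer argument is assumed to already exist.
def seek_examples(consumer):
    consumer.seek(0, 0)      # jump to the beginning of the topic
    consumer.seek(-100, 2)   # position 100 messages before the tail
    consumer.seek(50, 1)     # skip 50 messages ahead of the current position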
class KafkaMonitor: def __init__(self, settings): # dynamic import of settings file # remove the .py from the filename self.settings = importlib.import_module(settings[:-3]) # only need kafka for both uses self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS) def get_method(self, key): if key == 'handle_crawl_request': return self.handle_crawl_request elif key == 'handle_action_request': return self.handle_action_request raise AttributeError(key) def setup(self): self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST, port=self.settings.REDIS_PORT) self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC) self.consumer = SimpleConsumer(self.kafka_conn, self.settings.KAFKA_GROUP, self.settings.KAFKA_INCOMING_TOPIC, auto_commit=True, iter_timeout=1.0) self.result_method = self.get_method(self.settings.SCHEMA_METHOD) self.validator = self.extend_with_default(Draft4Validator) def extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in properties.iteritems(): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def handle_crawl_request(self, dict): ''' Processes a vaild crawl request @param dict: a valid dictionary object ''' # format key key = "{sid}:queue".format(sid=dict['spiderid']) val = pickle.dumps(dict, protocol=-1) # shortcut to shove stuff into the priority queue self.redis_conn.zadd(key, val, -dict['priority']) # if timeout crawl, add value to redis if 'expires' in dict: key = "timeout:{sid}:{appid}:{crawlid}".format( sid=dict['spiderid'], appid=dict['appid'], crawlid=dict['crawlid']) self.redis_conn.set(key, dict['expires']) print 'Added crawl to Redis' def handle_action_request(self, dict): ''' Processes a vaild action request @param dict: The valid dictionary object ''' # format key key = "{action}:{spiderid}:{appid}".format(action=dict['action'], spiderid=dict['spiderid'], appid=dict['appid']) if "crawlid" in dict: key = key + ":" + dict['crawlid'] self.redis_conn.set(key, dict['uuid']) print 'Added action to Redis' def _main_loop(self): ''' Continuous loop that reads from a kafka topic and tries to validate incoming messages ''' while True: start = time.time() try: for message in self.consumer.get_messages(): if message is None: break try: the_dict = json.loads(message.message.value) try: self.validator(self.schema).validate(the_dict) self.result_method(the_dict) except ValidationError as ex: print "invalid json received" except ValueError: print "bad json recieved" except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek(0, 2) end = time.time() time.sleep(.01) def run(self): ''' Sets up the schema to be validated against ''' self.setup() with open(self.settings.SCHEMA) as the_file: # No try/catch so we can see if there is a json parse error # on the schemas self.schema = json.load(the_file) self._main_loop() def feed(self, json_item): ''' Feeds a json item into the Kafka topic @param json_item: The loaded json object ''' topic = self.settings.KAFKA_INCOMING_TOPIC producer = SimpleProducer(self.kafka_conn) print "=> feeding JSON request into {0}...".format(topic) print json.dumps(json_item, indent=4) 
self.kafka_conn.ensure_topic_exists(topic) producer.send_messages(topic, json.dumps(json_item)) print "=> done feeding request."
# Imports reconstructed for readability; the scutils helper module paths are
# assumptions based on the surrounding project.
import json
import sys
import time
from collections import OrderedDict

import redis
from jsonschema import Draft4Validator, ValidationError, validators
from kafka import KafkaClient, SimpleConsumer, SimpleProducer
from kafka.common import KafkaUnavailableError, OffsetOutOfRangeError
from redis.exceptions import ConnectionError
from scutils.log_factory import LogFactory            # assumed helper module
from scutils.method_timer import MethodTimer          # assumed helper module
from scutils.settings_wrapper import SettingsWrapper  # assumed helper module
from scutils.stats_collector import StatsCollector    # assumed helper module


class KafkaMonitor:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string
        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d + 1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None
            with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(
            sorted(self.plugins_dict.items(), key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings
        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json, stdout=my_output, level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors
        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                    .get_rolling_time_window(
                        redis_conn=redis_conn,
                        key='{k}:{t}'.format(k=temp_key1, t=time),
                        window=time,
                        cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                    .get_rolling_time_window(
                        redis_conn=redis_conn,
                        key='{k}:{t}'.format(k=temp_key2, t=time),
                        window=time,
                        cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"
                                  .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"
                                    .format(s=item))
        total1 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key1),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        total2 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key2),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors
        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)
                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                        .get_rolling_time_window(
                            redis_conn=redis_conn,
                            key='{k}:{t}'.format(k=temp_key, t=time),
                            window=time,
                            cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"
                                      .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"
                                        .format(s=item))
            total = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=temp_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"
                              .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                    self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(
                    self.kafka_conn,
                    self.settings['KAFKA_GROUP'],
                    self.settings['KAFKA_INCOMING_TOPIC'],
                    auto_commit=True,
                    iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True

        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)

                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            v = self.validator(schema)
                            v.validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                instance.__class__.__name__,
                                the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn("Did not find schema to validate "
                                         "request", extra=extras)
                        self._increment_fail_stat(the_dict)
                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)
        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters
        @param string: the loaded message object for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters
        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters
        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"
                              .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic
        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
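# Usage sketch (added for illustration, not part of the original source).
# It shows the typical setup/feed/run sequence for this monitor; the
# settings file name and the JSON payload fields are placeholder assumptions.
if __name__ == '__main__':
    monitor = KafkaMonitor('localsettings.py')
    monitor.setup(level='DEBUG')
    # optionally push a request into the incoming topic before starting the
    # validation loop (fields below are illustrative only)
    monitor.feed({"url": "http://example.com",
                  "appid": "testapp",
                  "crawlid": "abc123"})
    monitor.run()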
import importlib
import json
import pickle
import time

import redis
from jsonschema import Draft4Validator, ValidationError, validators
from kafka import KafkaClient, SimpleConsumer, SimpleProducer
from kafka.common import OffsetOutOfRangeError


class KafkaMonitor:

    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request
        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request
        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)
                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"
                    except ValueError:
                        print "bad json received"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)

        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic
        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
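# Usage sketch (added for illustration, not part of the original source).
# It feeds one crawl-style request into the incoming topic and then starts
# the validation loop; the settings module name and request fields are
# placeholder assumptions matching handle_crawl_request above.
if __name__ == '__main__':
    monitor = KafkaMonitor('settings_crawling.py')
    monitor.feed({
        "url": "http://example.com",
        "appid": "testapp",
        "crawlid": "abc123",
        "spiderid": "link",
        "priority": 1,
    })
    monitor.run()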
import os
import time

from kafka import KafkaClient, SimpleConsumer
from kafka.common import OffsetOutOfRangeError


class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.

    Messages are blocked into 20MB files and transferred to HDFS

    Attributes:
        client: string representing IP:port of the kafka broker
        consumer: Consumer object specifying the client group, and topic
        temp_file_path: location of the 20MB file to be appended to before
            transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
            with
        block_cnt: integer representing the block count for print statements
    """

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git

        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS

        Returns:
            None
        """
        timestamp = time.strftime("%Y%m%d%H%M%S")

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                #     attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except OffsetOutOfRangeError:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders

        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS

        Returns:
            None
        """
        self.temp_file.close()

        timestamp = time.strftime("%Y%m%d%H%M%S")
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        print("Block {}: Flushing 20MB file to HDFS => {}".format(
            str(self.block_cnt), hadoop_fullpath))
        self.block_cnt += 1

        # place blocked messages into history folder on hdfs
        print("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path,
                                                hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        #                                                 cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime("%Y%m%d%H%M%S")
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
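# Usage sketch (added for illustration, not part of the original source).
# The broker address, consumer group, topic and local staging directory
# below are placeholder assumptions.
if __name__ == '__main__':
    consumer = Consumer("localhost:9092", "parking-group", "parking_raw")
    # drains up to two batches of 1000 messages, flushing 20MB blocks to HDFS
    consumer.consume_topic("/home/ubuntu/parking_data")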
import datetime
import io
import random

import avro.io
import avro.schema
from kafka import KafkaClient, SimpleConsumer, SimpleProducer
# LOGGER, TIMESTAMP_MILLIS and TestbotResult come from the surrounding test
# harness and are assumed to be defined elsewhere in this project.


class Prod2Cons(object):
    '''
    Implements blackbox producer & consumer test to/from Kafka
    '''

    def __init__(self, host, port, schema_path, topic, nbmsg, consumer_timeout):
        self.topic = topic
        self.nbmsg = nbmsg
        self.sent_msg = 0
        self.host = host
        self.port = port
        self.sent = [-100] * self.nbmsg
        self.rcv = [-100] * self.nbmsg
        self.runtag = str(random.randint(10, 100000))
        try:
            self.broker = KafkaClient("%s:%d" % (self.host, self.port))
        except:
            raise ValueError(
                "KafkaClient (%s:%d) - init failed" % (self.host, self.port))
        try:
            self.producer = SimpleProducer(self.broker)
        except:
            raise ValueError(
                "SimpleProducer (%s:%d) - init failed" % (self.host, self.port))
        try:
            self.consumer = SimpleConsumer(
                self.broker, "testbot", topic, iter_timeout=consumer_timeout)
        except:
            raise ValueError(
                "SimpleConsumer (%s:%d) - init failed" % (self.host, self.port))
        try:
            self.schema = avro.schema.parse(open(schema_path).read())
        except:
            raise ValueError(
                "Prod2Cons load schema (%s) - init failed" % (schema_path))

    def add_sent(self, index):
        ''' record the send time of message <index> '''
        self.sent[index] = datetime.datetime.now()

    def add_rcv(self, index):
        ''' record the receive time of message <index> '''
        self.rcv[index] = datetime.datetime.now()

    def average_ms(self):
        ''' compute the average latency between sent / rcv values '''
        result = 0
        for i in range(len(self.sent)):
            delta = (self.rcv[i] - self.sent[i])
            result += int(delta.total_seconds() * 1000)  # milliseconds
        return int(result / len(self.sent))

    def prod(self):
        ''' The test producer '''
        LOGGER.debug("prod2cons - start producer")
        writer = avro.io.DatumWriter(self.schema)
        for i in xrange(self.nbmsg):
            rawdata = "%s|%s" % (self.runtag, str(i))
            bytes_writer = io.BytesIO()
            encoder = avro.io.BinaryEncoder(bytes_writer)
            writer.write({"timestamp": TIMESTAMP_MILLIS(),
                          "src": "testbot",
                          "host_ip": "localhost",
                          "rawdata": rawdata},
                         encoder)
            raw_bytes = bytes_writer.getvalue()
            self.add_sent(i)
            self.producer.send_messages(self.topic, raw_bytes)
            self.sent_msg += 1
        return 0

    def consumer_reset(self):
        ''' Indicate to restart from the most recent offset '''
        self.consumer.seek(0, 2)

    def cons(self):
        ''' Run the consumer and return a test result struct '''
        LOGGER.debug("prod2cons - start consumer")
        readcount = 0
        readvalid = 0
        readnotvalid = 0
        avg_ms = -1
        # time.sleep(2) added for a local test for checking long delay display
        for message in self.consumer:
            readcount += 1
            try:
                newmessage = message[1][3]
                bytes_reader = io.BytesIO(newmessage)
                decoder = avro.io.BinaryDecoder(bytes_reader)
                reader = avro.io.DatumReader(self.schema)
                msg = reader.read(decoder)
                rawsplit = msg['rawdata'].split('|')
                if rawsplit[0] == self.runtag:
                    readvalid += 1
                    self.add_rcv(int(rawsplit[1]))
                else:
                    readnotvalid += 1
                    LOGGER.error(
                        "consumer reads unexpected message [%s] - runtag is [%s]",
                        msg['rawdata'], self.runtag)
            except:
                LOGGER.error("prod2cons - consumer failed")
                raise Exception("consumer failed")

        if readcount == self.nbmsg and readvalid == self.nbmsg:
            LOGGER.debug("consumer : test run ok")
            avg_ms = self.average_ms()

        return TestbotResult(self.sent_msg, readvalid, readnotvalid, avg_ms)
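# Usage sketch (added for illustration, not part of the original source).
# It runs one produce/consume round trip against a local broker; the schema
# path, topic name and message count are placeholder assumptions.
if __name__ == '__main__':
    test = Prod2Cons("localhost", 9092, "schemas/event.avsc", "testbot-topic",
                     nbmsg=10, consumer_timeout=5)
    test.consumer_reset()   # start reading from the most recent offset
    test.prod()             # send 10 avro-encoded test messages
    result = test.cons()    # read them back and compute the average latency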