# Consumer that batches DataMill result messages from Kafka into local CSV
# files and pushes each completed block to HDFS.
import os
import time

from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer


class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000,
                                       auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # CSV header; written with a trailing newline so the first message
        # starts on its own row
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # an unknown offset
                self.consumer.seek(0, 2)
                self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic,
                                                  timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place the blocked messages into the datamill folder on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        # start a fresh local file for the next block
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
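A minimal driver for the class above might look like the sketch below; the broker address, consumer group, and topic name are illustrative placeholders, not values taken from the original project.

if __name__ == '__main__':
    # hypothetical wiring: adjust broker address, group, and topic
    consumer = Consumer("localhost:9092", "datamill_group", "datamill_results")
    consumer.consume_topic()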
def run(self):
    client = KafkaClient(self.bootstrap_server, client_id='commandline')
    consumer = SimpleConsumer(client, self.group, self.topic,
                              auto_commit_every_n=1,
                              buffer_size=160,
                              auto_commit=True)
    for message in consumer:
        now = datetime.now()
        print("%s: %s" % (now, message))
        consumer.commit()
class Consumer(object):

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        print "Block {}: Flushing data file to HDFS => {}".format(
            str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1

        # save from local to hdfs, then remove the temp local file
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class Worker(object):

    def __init__(self, topic, hosts=None, log_level=logging.WARNING):
        hosts = hosts or "localhost:9092"
        self.group = "kafque"
        self.topic = "{}_{}".format(self.group, topic)
        self.client = KafkaClient(hosts)
        self.client.ensure_topic_exists(str(self.topic))
        self.consumer = SimpleConsumer(
            self.client, str(self.group), str(self.topic), auto_commit=False)
        self.consumer.provide_partition_info()
        self.consumer.fetch_last_known_offsets()
        self.logger = setup_logger(__name__, level=log_level)

        self.failed_queue = None
        if self.topic != "{}_failed".format(self.group):
            self.failed_queue = FailedQueue(
                hosts=hosts, log_level=logging.ERROR)

    def handle_signals(self):
        def warm_shutdown(signum, frame):
            # TODO: if worker is busy, defer cleanup to cold_shutdown
            self.logger.debug("Got signal {}.".format(signum))
            self.logger.warning("Warm shut down.")
            raise SystemExit()

        signal.signal(signal.SIGINT, warm_shutdown)
        signal.signal(signal.SIGTERM, warm_shutdown)

    def run(self):
        self.logger.info("kafque worker started.")
        self.handle_signals()

        for partition, message in self.consumer:
            self.logger.debug("Offset {}".format(message.offset))
            job = json.loads(message.message.value)
            callback = callback_from_string(job.pop("callback"))
            try:
                result = callback(*job["args"], **job["kwargs"])
                self.logger.info(result)
                self.consumer.commit()
            except Exception as exc:
                self.logger.error(exc, exc_info=True)
                # TODO: set job as failed
                if self.failed_queue:
                    self.failed_queue.enqueue(
                        callback, args=job["args"], kwargs=job["kwargs"])
                self.consumer.commit()
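Worker.run() json-decodes each message value into a job of the form {"callback": ..., "args": [...], "kwargs": {...}}. Below is a hedged sketch of publishing such a job with the same legacy kafka-python producer API; the topic suffix, broker address, and callback path are illustrative assumptions, not part of kafque itself.

import json

from kafka import KafkaClient, SimpleProducer

client = KafkaClient("localhost:9092")
producer = SimpleProducer(client)

# hypothetical job payload matching what Worker.run() expects to decode;
# the worker reads from the "kafque_<topic>" topic, here "kafque_default"
job = {"callback": "myapp.tasks.add", "args": [1, 2], "kwargs": {}}
producer.send_messages("kafque_default", json.dumps(job))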
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(self, zk_hosts, group, topic, nodes,
                 zk_handler=None, logger=None, identifier=None,
                 **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.

        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError("Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning("Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info('Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(lambda greenlet: self.zkp.join_group)
                if not self.nodes:
                    self.logger.info('Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
            self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
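A rough sketch of wiring up the ZKConsumer above follows; the ZooKeeper hosts, group, and topic are illustrative, an empty nodes list selects the dynamic ZKPartitioner path, and any extra keyword arguments are forwarded to SimpleConsumer.

# hypothetical setup; requires a reachable ZooKeeper ensemble and Kafka 0.8.1
consumer = ZKConsumer(
    zk_hosts='zk1:2181,zk2:2181',
    group='my_group',
    topic='my_topic',
    nodes=[],                 # empty list => dynamic rebalancing via ZKPartitioner
    max_buffer_size=None,     # forwarded to SimpleConsumer via **consumer_kwargs
)

msgs = consumer.get_messages(count=10, block=True, timeout=1.0)
print(len(msgs))
consumer.commit()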
class Consumer(object): """Kafka consumer class with functions to consume messages to HDFS. Messages are blocked into 20MB files and transferred to HDFS Attributes: client: string representing IP:port of the kafka broker consumer: Consumer object specifying the client group, and topic temp_file_path: location of the 20MB file to be appended to before transfer to HDFS temp_file: File object opened from temp_file_path topic: String representing the topic on Kafka group: String representing the Kafka consumer group to be associated with block_cnt: integer representing the block count for print statements """ def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/user/parking_data/history" self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self, output_dir): """Consumes a stream of messages from the "messages" topic. Code template from https://github.com/ajmssc/bitcoin-inspector.git Args: output_dir: string representing the directory to store the 20MB before transferring to HDFS Returns: None """ timestamp = time.strftime("%Y%m%d%H%M%S") # open file for writing self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w") # while True: for ii in range(0, 2): try: # get 1000 messages at a time, non blocking messages = self.consumer.get_messages(count=1000, block=False) # OffsetAndMessage(offset=43, message=Message(magic=0, # attributes=0, key=None, value='some message')) for message in messages: self.temp_file.write(message.message.value + "\n") # file size > 20MB if self.temp_file.tell() > 20000000: self.flush_to_hdfs(output_dir) self.consumer.commit() except: # move to tail of kafka topic if consumer is referencing # unknown offset self.consumer.seek(0, 2) def flush_to_hdfs(self, output_dir): """Flushes the 20MB file into HDFS. Code template from https://github.com/ajmssc/bitcoin-inspector.git Flushes the file into HDFS folders Args: output_dir: string representing the directory to store the 20MB before transferring to HDFS Returns: None """ self.temp_file.close() timestamp = time.strftime("%Y%m%d%H%M%S") hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp) print "Block {}: Flushing 20MB file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath) self.block_cnt += 1 # place blocked messages into history and cached folders on hdfs print ("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path, # cached_fullpath)) os.remove(self.temp_file_path) timestamp = time.strftime("%Y%m%d%H%M%S") self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" % (topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        print (self.temp_file)

        # one_entry = False
        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print (message)
                    # one_entry = True
                    # print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

            # if one_entry:
            #     print ("sending to hdfs")
            #     self.save_to_hdfs(output_dir, self.topic)
            #     self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        print ("Saving file to hdfs")
        self.temp_file.close()
        print ("Closed open file")

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        # print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " +
        #        hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
# stdlib
from collections import defaultdict

# 3p
from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer

kafka_conn = KafkaClient("kafka:9092")
consumer = SimpleConsumer(kafka_conn, "sample_check", "test-topic",
                          auto_commit=True)
for message in consumer.get_messages(count=10):
    print message.offset
consumer.commit()
class Consumer(object): """Kafka consumer class with functions to consume messages to HDFS. Messages are blocked into 20MB files and transferred to HDFS Attributes: client: string representing IP:port of the kafka broker consumer: Consumer object specifying the client group, and topic temp_file_path: location of the 20MB file to be appended to before transfer to HDFS temp_file: File object opened from temp_file_path topic: String representing the topic on Kafka group: String representing the Kafka consumer group to be associated with block_cnt: integer representing the block count for print statements """ def __init__(self, addr, group, topic): """Initialize Consumer with kafka broker IP, group, and topic.""" self.client = KafkaClient(addr) self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000) self.temp_file_path = None self.temp_file = None self.hadoop_path = "/user/parking_data/history" self.topic = topic self.group = group self.block_cnt = 0 def consume_topic(self, output_dir): """Consumes a stream of messages from the "messages" topic. Code template from https://github.com/ajmssc/bitcoin-inspector.git Args: output_dir: string representing the directory to store the 20MB before transferring to HDFS Returns: None """ timestamp = time.strftime('%Y%m%d%H%M%S') # open file for writing self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % ( output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w") # while True: for ii in range(0, 2): try: # get 1000 messages at a time, non blocking messages = self.consumer.get_messages(count=1000, block=False) # OffsetAndMessage(offset=43, message=Message(magic=0, # attributes=0, key=None, value='some message')) for message in messages: self.temp_file.write(message.message.value + "\n") # file size > 20MB if self.temp_file.tell() > 20000000: self.flush_to_hdfs(output_dir) self.consumer.commit() except: # move to tail of kafka topic if consumer is referencing # unknown offset self.consumer.seek(0, 2) def flush_to_hdfs(self, output_dir): """Flushes the 20MB file into HDFS. Code template from https://github.com/ajmssc/bitcoin-inspector.git Flushes the file into HDFS folders Args: output_dir: string representing the directory to store the 20MB before transferring to HDFS Returns: None """ self.temp_file.close() timestamp = time.strftime('%Y%m%d%H%M%S') hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp) print("Block {}: Flushing 20MB file to HDFS => {}".format( str(self.block_cnt), hadoop_fullpath)) self.block_cnt += 1 # place blocked messages into history and cached folders on hdfs print("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path, # cached_fullpath)) os.remove(self.temp_file_path) timestamp = time.strftime('%Y%m%d%H%M%S') self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % ( output_dir, self.topic, self.group, timestamp) self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "/home/ubuntu/FantasyFootball/ingestion/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        one_entry = False
        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    one_entry = True
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000:
                    self.save_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

            if one_entry:
                self.save_to_hdfs(output_dir)
                self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/user/solivero/playerpoints/history/%s_%s_%s.dat" % (
            self.group, self.topic, timestamp)
        cached_path = "/user/solivero/playerpoints/cached/%s_%s_%s.dat" % (
            self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        hadoop_path))
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        cached_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "/home/ubuntu/FantasyFootball/ingestion/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0
        os.system("hdfs dfs -mkdir /data2")

    def consume_topic(self, output_dir):
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 40MB
                if self.temp_file.tell() > 40000000:
                    self.flush_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        print "Block {}: Flushing 40MB file to HDFS => /data2".format(str(self.block_cnt))
        self.block_cnt += 1

        # place the blocked messages into the /data2 folder on hdfs
        os.system("hdfs dfs -copyFromLocal %s %s" % (self.temp_file_path, "/data2"))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
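The HDFS-batching consumers above all share the same entry point; a minimal, hypothetical driver (broker address, group, topic, and local staging directory are placeholders, not values from any of the original projects) might look like this:

if __name__ == '__main__':
    # hypothetical wiring; adjust to the actual broker, group, topic, and
    # local staging directory
    consumer = Consumer("localhost:9092", "hdfs_batch_group", "events")
    consumer.consume_topic("/tmp/kafka_staging")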