import json
import logging
import os
import shutil
import signal
import sys
import tempfile

from multiprocessing import Pool

# NOTE: Consumer, init_child, store, parse_args, get_logger and authenticate are
# assumed to be provided elsewhere in this module/package; they are not defined here.


class SimpleWorker:
    '''
        SimpleWorker is responsible for listening to a particular partition/topic of
    the Kafka cluster, consuming incoming messages and storing data to the HDFS.

    :param interval : Milliseconds spent waiting in poll if data is not available
                      in the buffer.
    :param processes: Number of the parallel processes.
    :param topic    : Topic to listen for new messages.
    :param partition: Partition number to consume.
    '''

    def __init__(self, interval, processes, topic, partition, **consumer):
        self._logger = logging.getLogger('SHIELD.SIMPLE.WORKER')
        self._logger.info('Initializing Simple Worker process...')

        self._interval  = interval
        self._isalive   = True
        self._processes = processes

        # .............................init Kafka Consumer
        self.Consumer = Consumer(**consumer)
        self.Consumer.assign(topic, [int(partition)])

        # .............................set up local staging area
        self._tmpdir = tempfile.mkdtemp(prefix='_SW.', dir=tempfile.gettempdir())
        self._logger.info('Use directory "{0}" as local staging area.'
            .format(self._tmpdir))

        # .............................define a process pool object
        self._pool = Pool(self._processes, init_child)
        self._logger.info('Simple Worker will use {0} parallel processes.'
            .format(self._processes))

        # register the termination handler for SIGUSR1
        signal.signal(signal.SIGUSR1, self.kill)
        self._logger.info('Initialization completed successfully!')

    def __del__(self):
        '''
            Called when the instance is about to be destroyed.
        '''
        if hasattr(self, '_tmpdir'):
            self._logger.info('Clean up temporary directory "{0}".'
                .format(self._tmpdir))
            shutil.rmtree(self._tmpdir)

    def kill(self, signum=None, frame=None):
        '''
            Receive signal for termination from an external process.

        Signal handlers are invoked with ``(signum, frame)``; the defaults keep
        direct calls working as well.
        '''
        self._logger.info('Receiving a kill signal from an external process...')
        self._isalive = False

    @classmethod
    def run(cls):
        '''
            Main command-line entry point.

        :param cls: The class as implicit first argument.
        '''
        try:
            args = parse_args()
            conf = json.loads(args.config_file.read())

            # .........................set up logger
            get_logger('SHIELD', args.log_level)

            # .........................check kerberos authentication
            if os.getenv('KRB_AUTH'):
                authenticate(conf['kerberos'])

            # .........................instantiate Simple Worker
            worker = cls(args.interval, args.parallel_processes, args.topic,
                args.partition, **conf['consumer'])

            worker.start(args.hdfs_directory)

        except SystemExit:
            raise
        except:
            sys.excepthook(*sys.exc_info())
            sys.exit(1)

    def start(self, hpath):
        '''
            Start Simple Worker.

        :param hpath: Destination folder in HDFS.
        '''
        self._logger.info('Start Simple Worker process!')
        self._logger.info('Messages will be stored under "{0}".'.format(hpath))

        try:
            while self._isalive:
                # hand each polled record off to the process pool, which stages it
                # locally and stores it in HDFS
                for record in self.Consumer.poll(self._interval):
                    if not record:
                        continue
                    self._pool.apply_async(store, args=(hpath, record, self._tmpdir))

        except KeyboardInterrupt:
            pass

        finally:
            self._pool.close()
            self._pool.join()
            self.Consumer.close()
            self._logger.info('Stop Simple Worker process.')
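
# Usage sketch (an assumption, not part of the original module): the worker is
# typically launched through the classmethod entry point, and can be stopped
# gracefully from another shell with `kill -USR1 <pid>`, which invokes
# SimpleWorker.kill() via the SIGUSR1 handler registered in __init__.
if __name__ == '__main__':
    SimpleWorker.run()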