コード例 #1
0
ファイル: simple.py プロジェクト: shield-h2020/dare-workers
class SimpleWorker:
    '''
        SimpleWorker is responsible for listening to a particular partition/topic of the
    Kafka cluster, consuming incoming messages and storing data to the HDFS.

    Every non-empty record returned by the consumer is handed to ``store`` through a
    pool of parallel processes, together with the HDFS destination and the path of a
    local staging directory created at initialization.

    :param interval : Milliseconds spent waiting in poll if data is not available in the
                      buffer.
    :param processes: Number of the parallel processes.
    :param topic    : Topic to listen for new messages.
    :param partition: Partition number to consume.
    :param consumer : Remaining keyword arguments, forwarded unchanged to ``Consumer``.
    '''
    def __init__(self, interval, processes, topic, partition, **consumer):
        self._logger = logging.getLogger('SHIELD.SIMPLE.WORKER')
        self._logger.info('Initializing Simple Worker  process...')

        self._interval = interval
        # Loop flag polled by `start`; cleared by `kill` to request termination.
        self._isalive = True
        self._processes = processes

        # .............................init Kafka Consumer
        self.Consumer = Consumer(**consumer)
        self.Consumer.assign(topic, [int(partition)])

        # .............................set up local staging area
        self._tmpdir = tempfile.mkdtemp(prefix='_SW.',
                                        dir=tempfile.gettempdir())
        self._logger.info('Use directory "{0}" as local staging area.'.format(
            self._tmpdir))

        # .............................define a process pool object
        self._pool = Pool(self._processes, init_child)
        self._logger.info(
            'Master Collector will use {0} parallel processes.'.format(
                self._processes))

        # An external process asks for termination by sending SIGUSR1.
        signal.signal(signal.SIGUSR1, self.kill)
        self._logger.info('Initialization completed successfully!')

    def __del__(self):
        '''
            Called when the instance is about to be destroyed.

        Removes the local staging directory, if one was created.
        '''
        # `_tmpdir` is missing when `__init__` failed before creating it.
        if hasattr(self, '_tmpdir'):
            self._logger.info('Clean up temporary directory "{0}".'.format(
                self._tmpdir))
            shutil.rmtree(self._tmpdir)

    def kill(self, signum=None, frame=None):
        '''
            Receive signal for termination from an external process.

        Registered as the SIGUSR1 handler in ``__init__``. Python invokes signal
        handlers with two positional arguments, so both are accepted here; they
        default to None so that a direct ``kill()`` call keeps working.

        :param signum: Number of the received signal (unused).
        :param frame : Stack frame interrupted by the signal (unused).
        '''
        self._logger.info(
            'Receiving a kill signal from an external process...')
        self._isalive = False

    @classmethod
    def run(cls):
        '''
            Main command-line entry point.

        Parses the command-line arguments, loads the JSON configuration, sets up
        logging, optionally authenticates, then builds a worker and starts it.
        Exits the process with status 1 on any error other than ``SystemExit``.

        :param cls: The class as implicit first argument.
        '''
        try:
            args = parse_args()
            conf = json.loads(args.config_file.read())

            # .........................set up logger
            get_logger('SHIELD', args.log_level)

            # .........................check kerberos authentication
            if os.getenv('KRB_AUTH'):
                authenticate(conf['kerberos'])

            # .........................instantiate Simple Worker
            worker = cls(args.interval, args.parallel_processes, args.topic,
                         args.partition, **conf['consumer'])

            worker.start(args.hdfs_directory)
        except SystemExit:
            # Let explicit exits keep their own status code.
            raise
        except:
            # Catch-all at the process boundary: report the error through the
            # installed excepthook and terminate with a failure status.
            sys.excepthook(*sys.exc_info())
            sys.exit(1)

    def start(self, hpath):
        '''
            Start Simple Worker.

        Polls the consumer until ``kill`` clears the alive flag or a
        ``KeyboardInterrupt`` is raised, submitting every non-empty record to the
        process pool. On exit, waits for the pool to finish and closes the consumer.

        :param hpath: Destination folder in HDFS.
        '''
        self._logger.info('Start Simple Worker process!')
        self._logger.info('Messages will be stored under "{0}".'.format(hpath))

        try:
            # The flag is re-checked only after each polled batch is dispatched.
            while self._isalive:
                for record in self.Consumer.poll(self._interval):
                    if not record:
                        continue
                    self._pool.apply_async(store,
                                           args=(hpath, record, self._tmpdir))

        except KeyboardInterrupt:
            pass
        finally:
            try:
                # Stop accepting tasks and wait for the submitted ones.
                self._pool.close()
                self._pool.join()
            finally:
                # Close the consumer even if shutting down the pool failed.
                self.Consumer.close()
                self._logger.info('Stop Simple Worker process.')