Example No. 1
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000,
                                       auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic,
                                                  timestamp)
        print "Block " + str(
            self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
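These snippets rely on kafka-python's legacy SimpleConsumer API (deprecated and later removed in kafka-python 2.0) and omit their imports. A minimal sketch of the wiring they assume, with a placeholder broker address, group, and topic, might look like this:

# Minimal sketch of the setup the examples above assume (kafka-python < 2.0).
# The broker address, group, and topic below are placeholders.
import time
from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer

client = KafkaClient("localhost:9092")
consumer = SimpleConsumer(client, "example_group", "example_topic",
                          auto_offset_reset='smallest')

# Poll in small non-blocking batches, mirroring the consume_topic() loops above.
for _ in range(10):
    for msg in consumer.get_messages(count=100, block=False):
        print(msg.message.value)
    consumer.commit()
    time.sleep(1)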
Example No. 2
    def run(self):
        client = KafkaClient(self.bootstrap_server, client_id='commandline')
        consumer = SimpleConsumer(client, self.group, self.topic, auto_commit_every_n=1, buffer_size=160,
                                  auto_commit=True)

        for message in consumer:
            now = datetime.now()
            print("%s: %s" % (now, message))
            consumer.commit()
Example No. 3
class Consumer(object):
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path,"w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)


    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,self.topic, timestamp)

        print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt),hadoop_fullpath)
        self.block_cnt += 1
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # save from local to hdfs
        os.remove(self.temp_file_path) # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example No. 4
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000, auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example No. 5
class Worker(object):
    def __init__(self, topic, hosts=None, log_level=logging.WARNING):
        hosts = hosts or "localhost:9092"
        self.group = "kafque"
        self.topic = "{}_{}".format(self.group, topic)
        self.client = KafkaClient(hosts)
        self.client.ensure_topic_exists(str(self.topic))
        self.consumer = SimpleConsumer(
            self.client, str(self.group), str(self.topic), auto_commit=False)
        self.consumer.provide_partition_info()
        self.consumer.fetch_last_known_offsets()
        self.logger = setup_logger(__name__, level=log_level)

        self.failed_queue = None
        if self.topic != "{}_failed".format(self.group):
            self.failed_queue = FailedQueue(
                hosts=hosts, log_level=logging.ERROR)

    def handle_signals(self):
        def warm_shutdown(signum, frame):
            # TODO: if worker is busy, defer cleanup to cold_shutdown
            self.logger.debug("Got signal {}.".format(signum))
            self.logger.warning("Warm shut down.")
            raise SystemExit()

        signal.signal(signal.SIGINT, warm_shutdown)
        signal.signal(signal.SIGTERM, warm_shutdown)

    def run(self):
        self.logger.info("kafque worker started.")
        self.handle_signals()

        for partition, message in self.consumer:
            self.logger.debug("Offset {}".format(message.offset))
            job = json.loads(message.message.value)

            callback = callback_from_string(job.pop("callback"))
            try:
                result = callback(*job["args"], **job["kwargs"])
                self.logger.info(result)
                self.consumer.commit()
            except Exception as exc:
                self.logger.error(exc, exc_info=True)

                # TODO: set job as failed
                if self.failed_queue:
                    self.failed_queue.enqueue(
                        callback, args=job["args"], kwargs=job["kwargs"])
                    self.consumer.commit()
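For context, a hedged sketch of the job payload Worker.run() above expects on its topic: a JSON object carrying a dotted callback path plus args/kwargs. The producer side and the callback path shown here are assumptions, not part of the code above.

# Hypothetical job payload matching what Worker.run() decodes above:
# json.loads(...) followed by job.pop("callback"), job["args"], job["kwargs"].
import json

job = json.dumps({
    "callback": "myapp.tasks.send_email",   # resolved via callback_from_string()
    "args": ["user@example.com"],
    "kwargs": {"subject": "hello"},
})
# A producer would publish this string to the "kafque_<topic>" Kafka topic
# that the worker subscribes to.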
Example No. 6
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(
            self,
            zk_hosts,
            group,
            topic,
            nodes,
            zk_handler=None,
            logger=None,
            identifier=None,
            **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.
        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError("Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning("Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info('Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(lambda greenlet: self.zkp.join_group)
                if not self.nodes:
                    self.logger.info('Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
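A hedged usage sketch for the ZKConsumer above. The ZooKeeper connection string, group, and topic are placeholders, and passing an empty nodes list assumes the dynamic ZKPartitioner branch of init_zkp() rather than StaticZKPartitioner.

# Hypothetical driver; hosts, group, and topic are placeholders.
zkc = ZKConsumer(
    zk_hosts='zk1:2181,zk2:2181',
    group='example_group',
    topic='example_topic',
    nodes=[],
)

try:
    # init_consumer() calls provide_partition_info(), so once partitions are
    # acquired iteration yields (partition, OffsetAndMessage) pairs.
    for partition, msg in zkc:
        print(msg.message.value)
        zkc.commit()
finally:
    zkc.stop()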
Example No. 7
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(self,
                 zk_hosts,
                 group,
                 topic,
                 nodes,
                 zk_handler=None,
                 logger=None,
                 identifier=None,
                 **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.
        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError(
                "Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']"
            )
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:

                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning(
                            "Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info(
                            'Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(
                            lambda greenlet: self.zkp.join_group)

                if not self.nodes:
                    self.logger.info(
                        'Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk,
                    self.group,
                    self.topic,
                    self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger,
                    identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk,
                    self.group,
                    self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger,
                    identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn(
                    'Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info(
                    'Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client,
                                       self.group,
                                       self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s",
                         self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self,
                     block=True,
                     timeout=0.1,
                     get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
Example No. 8
class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.
    Messages are blocked into 20MB files and transferred to HDFS
    Attributes:
        client: string representing IP:port of the kafka broker
        consumer: Consumer object specifying the client group, and topic
        temp_file_path: location of the 20MB file to be appended to before
            transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
            with
        block_cnt: integer representing the block count for print statements
    """

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS
        Returns:
            None
        """
        timestamp = time.strftime("%Y%m%d%H%M%S")

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders
        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS
        Returns:
            None
        """
        self.temp_file.close()

        timestamp = time.strftime("%Y%m%d%H%M%S")

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp)
        print "Block {}: Flushing 20MB file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        print ("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        # cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime("%Y%m%d%H%M%S")

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
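A short, hedged driver for the HDFS-flushing Consumer above; the broker address, group, topic, and local staging directory are placeholders.

# Hypothetical entry point; all arguments are placeholders.
if __name__ == '__main__':
    consumer = Consumer("localhost:9092", "example_group", "example_topic")
    consumer.consume_topic("/tmp/kafka_staging")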
Example No. 9
class Consumer(object):
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)

        print "Block {}: Flushing data file to HDFS => {}".format(
            str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1
        os.system(
            "hdfs dfs -put %s %s" %
            (self.temp_file_path, hadoop_fullpath))  # save from local to hdfs
        os.remove(self.temp_file_path)  # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example No. 10
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" %(topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self, output_dir):

        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        #open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        print(self.temp_file)
        #one_entry = False

        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)

                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print(message)
                    #one_entry = True
                    #print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        #if one_entry:
            #print ("sending to hdfs")
            #self.save_to_hdfs(output_dir, self.topic)
        #self.consumer.commit()

    def save_to_hdfs(self, output_dir):
	print ("Saving file to hdfs")
        self.temp_file.close()
	print ("Closed open file")
        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example No. 11
# stdlib
from collections import defaultdict

# 3p
from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer

kafka_conn = KafkaClient("kafka:9092")
consumer = SimpleConsumer(kafka_conn,
                          "sample_check",
                          "test-topic",
                          auto_commit=True)

for message in consumer.get_messages(count=10):
    print message.offset
    consumer.commit()
Example No. 12
class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.
    Messages are blocked into 20MB files and transferred to HDFS
    Attributes:
        client: string representing IP:port of the kafka broker
        consumer: Consumer object specifying the client group, and topic
        temp_file_path: location of the 20MB file to be appended to before
            transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
            with
        block_cnt: integer representing the block count for print statements
    """
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS
        Returns:
            None
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders
        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS
        Returns:
            None
        """
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        print("Block {}: Flushing 20MB file to HDFS => {}".format(
            str(self.block_cnt), hadoop_fullpath))
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        print("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" %
                  (self.temp_file_path, hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        # cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example No. 13
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" % (topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        print(self.temp_file)
        #one_entry = False

        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)

                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print(message)
                    #one_entry = True
                    #print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        #if one_entry:
            #print ("sending to hdfs")
            #self.save_to_hdfs(output_dir, self.topic)
        #self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        print("Saving file to hdfs")
        self.temp_file.close()
        print("Closed open file")
        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example No. 14
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self, output_dir):

        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        #open file for writing
        self.temp_file_path = "/home/ubuntu/FantasyFootball/ingestion/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        one_entry = False 

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    one_entry = True
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        if one_entry:
            self.save_to_hdfs(output_dir)

        self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_path = "/user/solivero/playerpoints/history/%s_%s_%s.dat" % (self.group, self.topic, timestamp)
        cached_path = "/user/solivero/playerpoints/cached/%s_%s_%s.dat" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,hadoop_path))
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,cached_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/fantasyfootball/ingestion/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example No. 15
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

        os.system ( "hdfs dfs -mkdir /data2" )

    def consume_topic(self, output_dir):
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path,"w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 40MB
                if self.temp_file.tell() > 40000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)


    def flush_to_hdfs(self, output_dir):

        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')

        print "Block {}: Flushing 40MB file to HDFS => /data2".format(str(self.block_cnt))
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -copyFromLocal %s %s" % (self.temp_file_path,
                                                        "/data2"))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
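Most of the examples above shell out with os.system() and ignore the exit status of hdfs dfs -put. As a hedged alternative (not taken from any of the projects above), subprocess.check_call() raises when the transfer fails instead of silently dropping the block:

# Sketch of a stricter HDFS upload helper; paths are placeholders.
import subprocess

def put_to_hdfs(local_path, hdfs_path):
    """Copy a local block file to HDFS, raising CalledProcessError on failure."""
    subprocess.check_call(["hdfs", "dfs", "-put", local_path, hdfs_path])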