Example #1
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000,
                                       auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic,
                                                  timestamp)
        print "Block " + str(
            self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (
            self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
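The bare except above also swallows errors that have nothing to do with offsets, and the commit after the while True loop is unreachable. A hedged variation of the polling step, assuming the pre-1.0 kafka-python API used throughout these examples, narrows the handler to OffsetOutOfRangeError (the same exception the Django command example further down this page catches):

from kafka.common import OffsetOutOfRangeError

def poll_once(consumer, temp_file):
    """One non-blocking poll: append fetched messages to the open file, then commit."""
    try:
        messages = consumer.get_messages(count=100, block=False)
        for message in messages:
            temp_file.write(message.message.value + "\n")
        consumer.commit()
    except OffsetOutOfRangeError:
        # consumer references an unknown offset; jump to the tail of the topic
        consumer.seek(0, 2)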
Example #2
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0,2)

    num = 0
    for message in consumer:
        print "redis publish:", num
        num+=1
        try:
            data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        except Exception, e:
            continue
        # print data_depickled
        # {  
        #    'text':'@_LulaMoore me hamas perra',
        #    'created_at':datetime.datetime(2015, 10, 9, 23, 36, 49),
        #    'source':u'Twitter Web Client',
        #    'lang:':u'es',
        #    'place':{  
        #       'country_code':u'AR',
        #       'coordinates':[  
        #          [  
        #             -68.176283,
        #             -38.984724
        #          ],
        #          [  
        #             -68.176283,
        #             -38.921051
        #          ],
        #          [  
        #             -68.015162,
        #             -38.921051
        #          ],
        #          [  
        #             -68.015162,
        #             -38.984724
        #          ]
        #       ]
        #    },
        #    'user':{  
        #       'statuses_count':15067,
        #       'name':u'Dama negra *\uffe6*',
        #       'friends_count':390,
        #       'created_at':datetime.datetime(2014, 3, 15,2,37, 10),
        #       'profile_image_url': u'http://pbs.twimg.com/profile_images/652333268256313344/x9K9Nlys_normal.jpg',
        #       'followers_count':384,
        #       'id':2390242428
        #    },
        #    'id':652628813935980544
        # }

        ### process data here ###
        # text = data_depickled['text']
        filtered_data = data_filter(data_depickled)
        data_pickled = pickle.dumps(filtered_data)
        redis.publish('tweets_processed', data_pickled)
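The example above pickles each filtered tweet and publishes it on the 'tweets_processed' Redis channel. A minimal, hypothetical sketch of the matching subscriber side; the redis-py StrictRedis client, host and port are assumptions, not part of the original example:

import pickle
import redis

r = redis.StrictRedis(host='localhost', port=6379)
pubsub = r.pubsub()
pubsub.subscribe('tweets_processed')

for item in pubsub.listen():
    if item['type'] != 'message':
        continue  # skip subscribe/unsubscribe notifications
    tweet = pickle.loads(item['data'])
    print(tweet)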
Example #3
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0,2)

    for message in consumer:
        # data_deserialized = str.decode(message.message.value)
        data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        # print str(data_depickled).decode('string_escape')
        print data_depickled
    def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
        print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer.seek(0,2)

        for message in consumer:
            message = parse_json(message)
            print "=============" + str(message) + "============"
            message_consume_function(message)
            print "called message consume function"
    def run(self):
        client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092")
        consumer = SimpleConsumer(client, "test-group", "jiketest",auto_commit=False,partitions=self.part)

        consumer.seek(0,0)

        while True:
            message = consumer.get_message(True,60)
            self.__offset = message.offset
            print message.message.value
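Note that get_message(True, 60) returns None once the 60-second timeout expires without a message, so message.offset above can raise AttributeError on an idle topic (the kafkadump example near the end of this page guards against exactly that). A hedged, self-contained variant with the None check, using placeholder broker and topic names:

from kafka import KafkaClient, SimpleConsumer

client = KafkaClient("localhost:9092")
consumer = SimpleConsumer(client, "test-group", "jiketest", auto_commit=False)
consumer.seek(0, 0)

while True:
    message = consumer.get_message(True, 60)
    if message is None:
        continue  # timed out with no message; poll again
    print(message.message.value)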
Example #6
class Consumer(object):
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path,"w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)


    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,self.topic, timestamp)

        print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt),hadoop_fullpath)
        self.block_cnt += 1
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # save from local to hdfs
        os.remove(self.temp_file_path) # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000, auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example #8
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-out-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise


    def next_tuple(self):
        """
        input message:
            dict(
                 id = input['id'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 raw_html =  html,
                 attrs = input['attrs']
            )
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """

        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value

        crawled = json.loads(message)
        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'userId': crawled['attrs']['userId'],
            'org': crawled['attrs']['org'],
            'domain': crawled['attrs']['domain'],
            'url': crawled['url']
        }
        self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'], crawled['timestamp'], context['source'], context])
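To exercise this spout end to end, a JSON message in the documented input shape has to land on the crawler-out topic. A hypothetical producer sketch using the old kafka-python SimpleProducer API; the broker address, topic name and every field value are placeholders:

import json
from kafka import KafkaClient, SimpleProducer

client = KafkaClient("localhost:9092")
producer = SimpleProducer(client)
payload = dict(
    id='test-id',
    appid='test-app',
    url='http://example.com',
    status_code=200,
    status_msg='Success',
    timestamp='Mon, 01 Jan 2024 00:00:00 GMT',
    links_found=[],
    raw_html='<html></html>',
    attrs=dict(userId='u1', org='o1', domain='d1'),
)
producer.send_messages('datawake-crawler-out', json.dumps(payload))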
Example #9
        def run(self):
            client = None
            consumer = None
            try:
                prev = None
                # print("Starting Kafka Client")
                # print("Kafka topic: {}").format(self.topic)
                print get_kafka_hosts()
                client = KafkaClient(hosts=get_kafka_hosts())
                consumer = SimpleConsumer(client=client,
                                          group=self.groupName.encode(
                                              'ascii', 'ignore'),
                                          topic=self.topic,
                                          iter_timeout=5)
                consumer.seek(0, 1)
                print '[Kafka Consumer] START'
                print 'Topic: {}'.format(self.topic)
                print 'Listening incoming message...'
                print '========================================================='
                # print("Listening kafka message...")

                while self.stopCpu is False:
                    for message in consumer.get_messages(count=5, block=False):
                        if self.stopCpu is True:
                            # print("Kafka Consumer Listening Stopped")
                            break

                        if message:
                            offset = message.offset
                            value = message.message.value
                            print 'msg: {0}, offset: {1}'.format(value, offset)

                            if len(value) > 0:
                                # chartdata = []
                                # j_val = json.loads(value)
                                # j_val['offset'] = offset
                                # chartdata.append(j_val)
                                # print("destination => ws"+str(self.pid))
                                # self.parentOj.emit("ws"+str(self.type), chartdata)
                                # self.parentOj.emit(self.topic, value)
                                self.parentOj.emit("ws" + str(self.pid), value)

                print '[Kafka Consumer] STOP'
                print 'Topic: {}'.format(self.topic)
                print 'Stop listening...'
                print '========================================================'
                # print("Listening kafka Stopped")
                consumer.stop()
                client.close()
            except Exception as e:
                consumer.stop()
                client.close()
Example #10
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([
                    url, '', '', '', html, timestamp, context['source'],
                    context
                ])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
class KafkaConsumer:

    group = "python-lookahead-consumer"

    def __init__(self,conn_pool,topic,group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None)
        self.consumer.seek(0,2) # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
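A brief usage sketch for the KafkaConsumer wrapper above; the connection string, topic and group names are placeholders:

consumer = KafkaConsumer("localhost:9092", "test-topic", "python-lookahead-consumer")
while True:
    print(consumer.next())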
Example #12
def consume():
    message_count = 0

    kafka = KafkaClient(KAFKA_HOSTS)

    consumer = SimpleConsumer(client=kafka,
                              group='XXXXX-YYYYY-ZZZZZ',
                              topic='events',
                              iter_timeout=15,
                              max_buffer_size=1024*1024*2)

    consumer.seek(0, )

    for m in consumer:
        message_count += 1
        pprint.pprint(m.message.value)
        print message_count
        print
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
class CrawlerSpout(Spout):

    group = 'datawake-crawler-in-consumer'.encode()


    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic ='+self.topic+' conn_pool='+self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0,2) # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error",level='error')
            self.log(traceback.format_exc(),level='error')
            raise

    def next_tuple(self):
        """
        input message:
             json.dumps(dict(
                    id = 'abcdefg', #TODO generate UUID,
                    appid = self.appid,
                    url = url,
                    priority = 50,
                    depth = 0,
                    attrs  = dict(
                        userId = context['userId'],
                        org =  context['org'],
                        domain = context['domain']
                    )
                ))
        :return:
        """
        try:
            for message in self.consumer:
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(),level='error')
Example #15
class KafkaConsumer:

    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka,
                                       self.group,
                                       self.topic,
                                       max_buffer_size=None)
        self.consumer.seek(0, 2)  # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
Example #16
class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
        make_option("--beginning",
                    action="store_true",
                    dest="beginning",
                    default=False,
                    help="Start from the beginning of the topic (seek to 0)"),
        make_option("--end",
                    action="store_true",
                    dest="end",
                    default=False,
                    help="Start from the end of the topic (seek to 0)"),
        make_option(
            "--dev",
            action="store_true",
            dest="dev",
            default=False,
            help=
            "Start a few hundred back from the end of the topic to warm local events."
        ),
    )

    topic = settings.KAFKA_EVENT_TOPIC

    @property
    def consumer_name(self):
        basename = str(__name__).split(".")[-1].replace("_", "-")
        return "%s-%s" % (
            basename,
            socket.gethostname(),
        )

    def handle(self, *args, **options):
        self.stdout.write("Starting %s worker for %s." % (
            self.topic,
            self.consumer_name,
        ))

        self.consumer = SimpleConsumer(client=kafka_client,
                                       group=self.consumer_name,
                                       topic=self.topic,
                                       max_buffer_size=1024 * 1024 * 2)

        # self.consumer = MultiProcessConsumer(client=kafka_client,
        #                                      group=self.consumer_name,
        #                                      topic=self.topic,
        #                                      num_procs=multiprocessing.cpu_count())

        self.dev_mode = False
        if options["beginning"]:
            self.consumer.seek(0, 0)
        elif options["end"]:
            self.consumer.seek(0, 2)
        elif options["dev"]:
            self.dev_mode = True
            self.consumer.seek(-300, 2)

        try:
            while bool(self.consumer):
                try:
                    for m in self.consumer:
                        self.handle_message(m)
                except OffsetOutOfRangeError:
                    self.stderr.write(
                        "Offset out of range error.  Seeking to beginning.")
                    self.consumer.seek(0, 0)

        except KeyboardInterrupt:
            self.consumer = False

            kafka_client.close()

            self.stdout.write("Shutting down %s worker." % (self.topic, ))
            self.stdout.write("Got Ctrl-C, terminating %s consumer." %
                              (self.topic, ))

    def handle_message(self, message_and_offset):
        offset = message_and_offset.offset
        message = message_and_offset.message
        payload = json.loads(message.value)

        team = payload.get("team")
        event_type = payload.get("event")

        if not payload.get("event_id"):
            self.stdout.write("Skipping event with no UUID <Offset: %s>" %
                              offset)
            return

        # Process Advisories
        if event_type in ADVISORY_EVENTS:
            task = create_advisory_from_event_payload
            if self.dev_mode:
                advisory = task(payload)
                if advisory:
                    print "Created %s" % advisory
            else:
                task.apply_async(args=(payload, ))

        # New Sensor Enrollment
        if event_type == "sensor_new":
            return Sensor.create_from_payload(payload)

        # Cluster Destroy Event
        if event_type == "dead_packages_group":
            cluster_id = payload["value"].values()[0]
            try:
                return Cluster.objects.get(uuid=cluster_id).delete()
            except Cluster.DoesNotExist:
                return
Example #17
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(
            self,
            zk_hosts,
            group,
            topic,
            nodes,
            zk_handler=None,
            logger=None,
            identifier=None,
            **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.
        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError("Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning("Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info('Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(lambda greenlet: self.zkp.join_group)
                if not self.nodes:
                    self.logger.info('Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
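A hedged usage sketch for the ZKConsumer above. The ZooKeeper connection string, group, topic and node list are placeholders; extra keyword arguments (iter_timeout here) are forwarded to the underlying SimpleConsumer:

consumer = ZKConsumer(
    zk_hosts='localhost:2181',
    group='my-group',
    topic='my-topic',
    nodes=[],          # an empty node list selects the dynamic ZKPartitioner
    iter_timeout=5,
)
while True:
    # init_consumer() calls provide_partition_info(), so each entry should be
    # a (partition, OffsetAndMessage) pair
    for partition, message in consumer.get_messages(count=100, timeout=1.0):
        print(message.message.value)
    consumer.commit()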
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self, output_dir):

        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        #open file for writing
        self.temp_file_path = "/home/ubuntu/FantasyFootball/ingestion/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        one_entry = False 

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    one_entry = True
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        if one_entry:
            self.save_to_hdfs(output_dir)

        self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_path = "/user/solivero/playerpoints/history/%s_%s_%s.dat" % (self.group, self.topic, timestamp)
        cached_path = "/user/solivero/playerpoints/cached/%s_%s_%s.dat" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,hadoop_path))
        os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,cached_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/fantasyfootball/ingestion/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example #19
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            self.settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            dict(
                 crawlid = input['crawlid'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 body =  html,
                 attrs = input['attrs']
            )
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """

        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value

        crawled = json.loads(message)
        if crawled['appid'] == self.settings["appid"]:
            safeurl = crawled['url'].encode('utf-8', 'ignore')
            self.log("Lookahead spout received id: " + crawled['crawlid'] +
                     " url: " + safeurl)
            context = {
                'source': 'datawake-lookahead',
                'domain': crawled['attrs']['domain']
            }
            self.emit([
                crawled['url'], crawled['status_code'], '', '',
                crawled['body'], crawled['timestamp'], context['source'],
                context
            ])
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

        os.system ( "hdfs dfs -mkdir /data2" )

    def consume_topic(self, output_dir):
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path,"w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 40MB
                if self.temp_file.tell() > 40000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)


    def flush_to_hdfs(self, output_dir):

        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')

        print "Block {}: Flushing 40MB file to HDFS => /data2".format(str(self.block_cnt))
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -copyFromLocal %s %s" % (self.temp_file_path,
                                                        "/data2"))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example #21
class Consumer(object):
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)

        print "Block {}: Flushing data file to HDFS => {}".format(
            str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1
        os.system(
            "hdfs dfs -put %s %s" %
            (self.temp_file_path, hadoop_fullpath))  # save from local to hdfs
        os.remove(self.temp_file_path)  # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" %(topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self, output_dir):

        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        #open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        print(self.temp_file)
        #one_entry = False

        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)

                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print(message)
                    #one_entry = True
                    #print(self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        #if one_entry:
        #    print("sending to hdfs")
        #    self.save_to_hdfs(output_dir, self.topic)
        #self.consumer.commit()

    def save_to_hdfs(self, output_dir):
	print ("Saving file to hdfs")
        self.temp_file.close()
	print ("Closed open file")
        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example #23
        def run(self):
            client = None
            consumer = None
            try:
                prev = None
                print("Starting Kafka Client")
                print("Kafka topic: {}").format(self.topic)

                hosts = get_es_hosts()
                print "es hosts: ", hosts
                es = Elasticsearch(hosts=hosts)
                # ES_HOST = {"host": "localhost", "port": 9200}
                # es = Elasticsearch(hosts=[ES_HOST])

                # host_list = get_all_host('KAFKA')
                hosts = get_kafka_hosts()
                print "kafka hosts: ", hosts
                client = KafkaClient(hosts=hosts)
                # print "kafka client group name: "+self.groupName
                consumer = SimpleConsumer(client, self.groupName, self.topic)
                # seek to 5 messages before the tail of the topic (whence=2)
                consumer.seek(-5, 2)
                print("Listening kafka message...")

                while self.stopCpu is False:
                    # for message in consumer.get_messages(count=5, block=False):
                    for message in consumer:
                        if self.stopCpu is True:
                            print("Kafka Consumer Listening Stopped")
                            break

                        if message:
                            print "Consuming kafka message: ", message
                            value = message.message.value
                            try:
                                json_value = json.loads(value)
                                offset = message.offset
                                json_value['data'][0]['offset'] = offset
                                value = json.dumps(json_value)
                                print "Publishing data: ", value

                                doc = json.dumps(json_value['data'][0])
                                if len(doc) > 0:
                                    es.index(index='kafka',
                                             doc_type=self.topic,
                                             id=offset,
                                             body=doc)

                                if len(value) > 0:
                                    self.parentOj.emit("ws" + str(self.pid),
                                                       value)
                            except Exception as e:
                                traceback.print_exc()
                                print "Skipping invalid message"
            except Exception as e:
                traceback.print_exc()
            finally:
                print("Listening kafka Stopped")
                print "Stopping consumer ..."
                consumer.stop()
                print "Closing client ..."
                client.close()
def main():
    """kafkadump: Kafka topic dump utility for debugging.

    Usage:
        kafkadump list --host=<host>
        kafkadump dump <topic> --host=<host> [--consumer=<consumer>]

    Examples:

        List all the topics on your local Kafka instance:

            python kafkadump.py list --host=<kafkahost>:9092

        Dump the contents of a single topic starting from offset 0:

            python kafkadump.py dump test.crawled_firehose --host=<kafkahost>:9092

        Use CTRL+C (SIGINT, KeyboardInterrupt) to stop it from polling Kafka.
        It will end by printing the total records serviced and the raw output
        of the most recent record.

    Options:
        -h --host <host>            Kafka host name where Kafka cluster will be resolved
        -c --consumer <consumer>    Consumer group ID to use for reading messages
    """
    args = docopt(main.__doc__)
    host = args["--host"]

    logging.basicConfig()

    print "=> Connecting to {0}...".format(host)
    kafka = KafkaClient(host)
    print "=> Connected."

    if args["list"]:
        for topic in kafka.topic_partitions.keys():
            print topic
        return 0
    elif args["dump"]:
        topic = args["<topic>"]
        consumer_id = args["--consumer"] or "default"
        consumer = SimpleConsumer(kafka, consumer_id, topic,
                            buffer_size=1024*100,      # 100kb
                            fetch_size_bytes=1024*100, # 100kb
                            max_buffer_size=None       # eliminate big message errors
                            )
        consumer.seek(0, 0)
        num_records = 0
        total_bytes = 0
        item = None
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    time.sleep(1)
                    continue
                val = message.message.value
                item = json.loads(val)
                body_bytes = len(item)
                print item
                num_records = num_records + 1
                total_bytes = total_bytes + body_bytes
            except:
                traceback.print_exc()
                break
        total_mbs = float(total_bytes) / (1024*1024)
        print
        if item is not None:
            print json.dumps(item, indent=4)
        if num_records == 0:
            num_records = 1
        print num_records, "records", total_mbs, "megabytes", (float(total_bytes) / num_records / 1024), "kb per msg"
        kafka.close()
        return 0
Example #25
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print 'Purpose: move the consumer offset for a group to the position of a given time'
        print 'Kafka server: xxxxxxx:9092 (same as 15/25:9092)'
        print 'Usage: .py [topic] [group] [date]'
    else:
        topic = sys.argv[1]
        group = sys.argv[2]
        date = sys.argv[3]
        server = 'xxxxxxx:9092'
        print 'Adjusting the offset on %s for topic %s, consumer group %s, to time %s...' % (server, topic, group, date)
        client = KafkaClient(server)
        consumer = SimpleConsumer(client, group, topic)
        step = 10000
        consumer.seek(step, 0)
        cnt = 0
        while step > 1:
            cnt = cnt + 1
            message = consumer.get_message()
            msg = json.loads(message.message.value)
            if msg.has_key('up_time'):
                if cnt % 2 == 0:
                    print 'Processed %s to date %s' % (cnt, msg['up_time'])
                if msg['up_time'] > date:
                    step = int(step * 2 / 3)
                    consumer.seek(-step, 1)
                elif msg['up_time'] == date:
                    break
                else:
                    consumer.seek(step, 1)
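The script above homes in on the target date by seeking relative to the current offset (whence=1, per the seek semantics documented in the ZKConsumer example earlier on this page) and shrinking the step whenever it overshoots. A hedged restatement of the same idea as a reusable helper; the 'up_time' field and date ordering are taken from the example, everything else is a placeholder:

import json

def seek_to_date(consumer, date, step=10000):
    """Move the consumer close to the first message at or after `date`."""
    consumer.seek(step, 0)                # start `step` messages past the head
    while step > 1:
        message = consumer.get_message()
        if message is None:
            continue                      # nothing fetched; poll again
        msg = json.loads(message.message.value)
        up_time = msg.get('up_time')
        if up_time is None:
            continue
        if up_time > date:
            step = int(step * 2 / 3)
            consumer.seek(-step, 1)       # overshot: step back and shrink the step
        elif up_time == date:
            break
        else:
            consumer.seek(step, 1)        # undershot: step forward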
Example #26
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(self,
                 zk_hosts,
                 group,
                 topic,
                 nodes,
                 zk_handler=None,
                 logger=None,
                 identifier=None,
                 **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.
        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError(
                "Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']"
            )
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while True:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:

                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning(
                            "Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info(
                            'Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(
                            lambda greenlet: self.zkp.join_group())

                if not self.nodes:
                    self.logger.info(
                        'Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk,
                    self.group,
                    self.topic,
                    self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger,
                    identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk,
                    self.group,
                    self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger,
                    identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn(
                    'Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info(
                    'Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client,
                                       self.group,
                                       self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s",
                         self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
            self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)
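        # Illustrative calls (hypothetical offsets):
        #   seek(0, 0)    -> rewind to the earliest available offset
        #   seek(0, 2)    -> jump to the latest known offset (tail)
        #   seek(-100, 2) -> position 100 messages before the tail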

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self,
                     block=True,
                     timeout=0.1,
                     get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
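
A minimal sketch of how the ZKConsumer above might be wired up and polled; the ZooKeeper hosts, group, topic, identifier and the process_value() callback are placeholders, and an empty nodes list is assumed so the dynamic ZKPartitioner is used:

def run_zk_consumer(process_value):
    consumer = ZKConsumer(zk_hosts='zk1:2181,zk2:2181,zk3:2181',
                          group='example-group',
                          topic='example.topic',
                          nodes=[],               # empty -> dynamic ZKPartitioner
                          identifier='worker-1')
    try:
        while True:
            # provide_partition_info() is called in init_consumer, so each
            # entry is a (partition, OffsetAndMessage) pair
            for partition, msg in consumer.get_messages(count=100, timeout=1.0):
                process_value(partition, msg.message.value)
            consumer.commit()
    except KeyboardInterrupt:
        consumer.stop()
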
Exemple #27
0
class KafkaMonitor:
    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )
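    # Example of the extended validator (hypothetical schema): validating the
    # instance {} against {"properties": {"priority": {"default": 5}}} with
    # extend_with_default(Draft4Validator) leaves the instance as {"priority": 5},
    # i.e. defaults are written back into the incoming dict during validation.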

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

        print 'Added crawl to Redis'

    def handle_action_request(self, dict):
        '''
        Processes a valid action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(action=dict['action'],
                                                   spiderid=dict['spiderid'],
                                                   appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

        print 'Added action to Redis'

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"

                    except ValueError:
                        print "bad json recieved"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)
            self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
Exemple #28
0
class Consumer(object):
    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" % (topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        print(self.temp_file)
        #one_entry = False

        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)

                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print(message)
                    #one_entry = True
                    #print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

#if one_entry:
#print ("sending to hdfs")
#self.save_to_hdfs(output_dir, self.topic)
#self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        print("Saving file to hdfs")
        self.temp_file.close()
        print("Closed open file")
        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" %
                  (self.temp_file_path, cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
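
A small driver for the Consumer above might look like the following; the broker address, group, topic and output directory are placeholders:

if __name__ == '__main__':
    consumer = Consumer('localhost:9092', 'adreport-group', 'adreport_topic')
    # consume_topic() loops forever, flushing roughly 2 MB blocks to HDFS
    consumer.consume_topic('/home/ubuntu/adreport')
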
Exemple #29
0
class KafkaMonitor:
    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d + 1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None

            print("self.settings['PLUGIN_DIR'] + instance.schema====",
                  self.settings['PLUGIN_DIR'] + instance.schema)
            with open(self.settings['PLUGIN_DIR'] +
                      instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(
            sorted(self.plugins_dict.items(), key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key1),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        total2 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key2),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=temp_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                    self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(
                    self.kafka_conn,
                    self.settings['KAFKA_GROUP'],
                    self.settings['KAFKA_INCOMING_TOPIC'],
                    auto_commit=True,
                    iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True

        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)
                    print('the_dict', the_dict)
                    found_plugin = False

                    print('self.plugins_dict', self.plugins_dict)
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']

                        print('instance==', instance)

                        schema = obj['schema']

                        print(
                            'schema********************************************',
                            schema)
                        try:
                            print('before       v = self.validator(schema)')

                            v = self.validator(schema)

                            print('after       v = self.validator(schema)')

                            print('the_dict-------', the_dict)

                            v.validate(the_dict)
                            found_plugin = True

                            print('found_plugin====', found_plugin)

                            self._increment_plugin_stat(
                                instance.__class__.__name__, the_dict)

                            print('instance.handle(the_dict)', the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:

                            print('  except ValidationError:======')

                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn(
                            "Did not find schema to validate "
                            "request",
                            extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the raw message string for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':

                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the total stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the total stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][
                        key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
Exemple #30
0
class KafkaMonitor:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d+1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None
            with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(),
                                               key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json, stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key1),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        total2 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key2),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                   key='{k}:lifetime'.format(k=temp_key),
                                                   cycle_time=self.settings['STATS_CYCLE'],
                                                   roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                        self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(self.kafka_conn,
                                               self.settings['KAFKA_GROUP'],
                                               self.settings['KAFKA_INCOMING_TOPIC'],
                                               auto_commit=True,
                                               iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True
        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)
                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                    instance.__class__.__name__,
                                    the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn("Did not find schema to validate "
                                         "request", extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the raw message string for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':

                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the total stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the total stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
class KafkaMonitor:
    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"

                    except ValueError:
                        print "bad json recieved"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/except here, so a JSON parse error in the schema
            # file surfaces immediately
            self.schema = json.load(the_file)
            self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
Exemple #32
0
class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.
    Messages are blocked into 20MB files and transferred to HDFS
    Attributes:
        client: KafkaClient connected to the Kafka broker at IP:port
        consumer: SimpleConsumer tied to the client, group, and topic
        temp_file_path: location of the 20MB file to be appended to before
            transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
            with
        block_cnt: integer representing the block count for print statements
    """

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Args:
            output_dir: string representing the directory in which to store
                the 20MB file before transferring it to HDFS
        Returns:
            None
        """
        timestamp = time.strftime("%Y%m%d%H%M%S")

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders
        Args:
            output_dir: string representing the directory in which to store
                the 20MB file before transferring it to HDFS
        Returns:
            None
        """
        self.temp_file.close()

        timestamp = time.strftime("%Y%m%d%H%M%S")

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp)
        print "Block {}: Flushing 20MB file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        print ("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        # cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime("%Y%m%d%H%M%S")

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
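
A short usage sketch for the Consumer above; the broker address, consumer
group, topic and output directory are illustrative placeholders, not values
taken from the original example.

if __name__ == '__main__':
    # illustrative broker address, consumer group and topic
    consumer = Consumer('localhost:9092', 'hdfs-loader', 'parking_data')
    # each call pulls up to two non-blocking batches of 1000 messages into a
    # local .dat file and flushes it to HDFS once it grows past 20MB
    consumer.consume_topic('/home/ubuntu/kafka_blocks')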
Exemple #33
0
class Prod2Cons(object):
    '''
    Implements blackbox producer & consumer test to/from Kafka
    '''
    def __init__(self, host, port, schema_path, topic, nbmsg, consumer_timeout):
        self.topic = topic
        self.nbmsg = nbmsg
        self.sent_msg = 0
        self.host = host
        self.port = port
        self.sent = [-100] * self.nbmsg
        self.rcv = [-100] * self.nbmsg
        self.runtag = str(random.randint(10, 100000))
        try:
            self.broker = KafkaClient("%s:%d" % (self.host, self.port))
        except:
            raise ValueError(
                "KafkaClient (%s:%d) - init failed" % (self.host, self.port))
        try:
            self.producer = SimpleProducer(self.broker)
        except:
            raise ValueError(
                "SimpleProducer (%s:%d) - init failed" % (self.host, self.port))
        try:
            self.consumer = SimpleConsumer(
                self.broker, "testbot", topic, iter_timeout=consumer_timeout)
        except:
            raise ValueError(
                "SimpleConsumer (%s:%d) - init failed" % (self.host, self.port))
        try:
            self.schema = avro.schema.parse(open(schema_path).read())
        except:
            raise ValueError(
                "Prod2Cons load schema (%s) - init failed" % (schema_path))

    def add_sent(self, index):
        '''
           Record the time at which the message at `index` was sent
        '''
        self.sent[index] = datetime.datetime.now()

    def add_rcv(self, index):
        '''
           Record the time at which the message at `index` was received
        '''
        self.rcv[index] = datetime.datetime.now()

    def average_ms(self):
        '''
           Compute the average send-to-receive latency in milliseconds
        '''
        result = 0
        for i in range(len(self.sent)):
            delta = (self.rcv[i] - self.sent[i])
            result += int(delta.total_seconds() * 1000)  # milliseconds
        return int(result / len(self.sent))

    def prod(self):
        '''
           The test producer
        '''
        LOGGER.debug("prod2cons - start producer")
        writer = avro.io.DatumWriter(self.schema)
        for i in xrange(self.nbmsg):
            rawdata = "%s|%s" % (self.runtag, str(i))
            bytes_writer = io.BytesIO()
            encoder = avro.io.BinaryEncoder(bytes_writer)
            writer.write({"timestamp": TIMESTAMP_MILLIS(),
                          "src": "testbot",
                          "host_ip": "localhost",
                          "rawdata": rawdata},
                         encoder)
            raw_bytes = bytes_writer.getvalue()
            self.add_sent(i)
            self.producer.send_messages(self.topic, raw_bytes)
            self.sent_msg += 1
        return 0

    def consumer_reset(self):
        '''
           Seek to the most recent offset so reading restarts at the tail of the topic
        '''
        self.consumer.seek(0, 2)

    def cons(self):
        '''
           Run the consumer and return a test result struct
        '''
        LOGGER.debug("prod2cons - start consumer")
        readcount = 0
        readvalid = 0
        readnotvalid = 0
        avg_ms = -1
        # time.sleep(2) added for a local test to check long-delay display
        for message in self.consumer:
            readcount += 1
            try:
                newmessage = message[1][3]
                bytes_reader = io.BytesIO(newmessage)
                decoder = avro.io.BinaryDecoder(bytes_reader)
                reader = avro.io.DatumReader(self.schema)
                msg = reader.read(decoder)
                rawsplit = msg['rawdata'].split('|')
                if rawsplit[0] == self.runtag:
                    readvalid += 1
                    self.add_rcv(int(rawsplit[1]))
                else:
                    readnotvalid += 1
                    LOGGER.error("consumer  reads unexpected message [%s] - runtag is [%s]",
                                 msg['rawdata'],
                                 self.runtag)

            except:
                LOGGER.error("prod2cons - consumer failed")
                raise Exception("consumer failed")

        if readcount == self.nbmsg and readvalid == self.nbmsg:
            LOGGER.debug("consumer : test run ok")
            avg_ms = self.average_ms()

        return TestbotResult(self.sent_msg, readvalid, readnotvalid, avg_ms)
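
A round-trip driver for Prod2Cons, sketched under the assumption of a broker
on localhost:9092, an Avro schema file at testbot.avsc (hypothetical path) and
an illustrative topic name; the arguments simply mirror the constructor
signature above.

def run_roundtrip():
    # 10 messages, 5 second consumer iter_timeout; topic name is illustrative
    test = Prod2Cons('localhost', 9092, 'testbot.avsc', 'avro.events', 10, 5)
    test.consumer_reset()   # start reading from the tail of the topic
    test.prod()             # produce the tagged Avro test messages
    return test.cons()      # read them back and return a TestbotResult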
Exemple #34
0
class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.
    Messages are blocked into 20MB files and transferred to HDFS
    Attributes:
        client: KafkaClient connected to the Kafka broker at IP:port
        consumer: SimpleConsumer tied to the client, group, and topic
        temp_file_path: location of the 20MB file to be appended to before
            transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
            with
        block_cnt: integer representing the block count for print statements
    """
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client,
                                       group,
                                       topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Args:
            output_dir: string representing the directory in which to store
                the 20MB file before transferring it to HDFS
        Returns:
            None
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders
        Args:
            output_dir: string representing the directory in which to store
                the 20MB file before transferring it to HDFS
        Returns:
            None
        """
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        print("Block {}: Flushing 20MB file to HDFS => {}".format(
            str(self.block_cnt), hadoop_fullpath))
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        print("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" %
                  (self.temp_file_path, hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        # cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (
            output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")