Example 1
 def forwarder(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     producer = SimpleProducer(client, batch_send=False)
     print producer
     for i in xrange(1, 100):
         with open(self.csvfile, 'r') as FR:
             fields = next(FR).strip().split('\t')
             print fields
             for cnc_log in FR:
                 values = cnc_log.strip().split('\t')
                 zipped = dict(zip(fields, values))
                 zipped['lower_bound'] = float(zipped['lower_bound'])
                 zipped['upper_bound'] = float(zipped['upper_bound'])
                 zipped['temperature'] = float(zipped['temperature'])
                 zipped['no'] = int(zipped['no'])
                 print json.dumps(zipped, sort_keys=True, indent=4)
                 # prob = 0.8
                 # y = lambda x, prob: '<span style="background-color:#bd362f; color:white">FAIL</span>' if randint(0,x) > x*prob  else 'PASS'
                 # cnc_log = (datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')+"\t"+y(10,0.8)+'\t'+cnc_log.strip()).split('\t')
                 # zipped = dict(zip(fields,cnc_log))
                 # node = zipped
                 sleep_sec = random.uniform(0, 3) * 5
                 time.sleep(sleep_sec)
                 producer.send_messages(self.topic_name, json.dumps(zipped))
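The forwarder above expects a tab-separated file whose first line names the columns it converts (no, temperature, lower_bound, upper_bound). A minimal sketch of such an input follows; the file name and values are made up for illustration only.

# Illustrative only: writes a tiny TSV in the shape the forwarder reads.
# "cnc_log.tsv" is a placeholder for whatever self.csvfile points at.
sample = (
    "no\ttemperature\tlower_bound\tupper_bound\n"
    "1\t21.5\t20.0\t25.0\n"
    "2\t26.1\t20.0\t25.0\n"
)
with open("cnc_log.tsv", "w") as fh:
    fh.write(sample)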
Example 2
 def configure_internal_queues(self):
     """
     configures the internal queues used to hold references to events in the input queue
     """
     for i in range(self.number_of_queues):
         client = KafkaClient(hosts=self.kafka_hosts)
         queue_name = SCHEDULER_QUEUE_FORMAT.format(2**i)
         client.ensure_topic_exists(queue_name)
         indexed_consumer = IndexedConsumer(self.input_topic,
                                            self.kafka_hosts)
         queue_consumer = KafkaConsumer(
             queue_name,
             bootstrap_servers=self.kafka_hosts,
             group_id=queue_name,
             consumer_timeout_ms=2000,
             auto_commit_enable=False,
         )
         queue_producer = SimpleProducer(client)
         queue_duration = 2**i
         self.queues.append(
             InternalQueue(
                 queue_consumer,
                 indexed_consumer,
                 queue_producer,
                 self.number_of_queues,
                 queue_duration,
             ))
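Each internal queue above covers a delay of 2**i time units and gets a topic name derived from that duration. A minimal sketch of the naming scheme, assuming a hypothetical SCHEDULER_QUEUE_FORMAT template (the real constant lives elsewhere in this project):

# Assumption: the real SCHEDULER_QUEUE_FORMAT is defined elsewhere; this
# template is only for illustration of the power-of-two naming scheme.
SCHEDULER_QUEUE_FORMAT = "scheduler_queue_{}"

number_of_queues = 4
for i in range(number_of_queues):
    queue_duration = 2 ** i          # 1, 2, 4, 8 ... time units
    queue_name = SCHEDULER_QUEUE_FORMAT.format(queue_duration)
    print(queue_name)                # scheduler_queue_1, scheduler_queue_2, ...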
Example 3
 def forwarder(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     producer = SimpleProducer(client, batch_send=False)
     print producer
     no = 1
     for i in xrange(1,10000):
         with open(self.csvfile, 'r') as FR:
             first_line = next(FR)
             print first_line
             fields = first_line.lstrip().rstrip().split('\t')
             print fields
             for cnc_log in FR:
                 print cnc_log
                 values = cnc_log.strip().split('\t')
                 zipped = dict(zip(fields,values))
                 zipped['lower_bound'] = float(zipped['lower_bound'])
                 zipped['upper_bound'] = float(zipped['upper_bound'])
                 zipped['spindle'] = float(zipped['spindle'])
                 # zipped['no'] = int(zipped['no'])
                 zipped['no'] = no
                 zipped['tool_no'] = int(zipped['tool_no'])
                 # zipped['tool_no'] = i
                 print json.dumps(zipped,sort_keys=True,indent=4)
                 sleep_sec = 1
                 time.sleep(sleep_sec)
                 producer.send_messages(self.topic_name, json.dumps(zipped))
                 no = no +1
Example 4
class KafkaConnector(object):

    def __init__(self, host_name, host_port):
        self.client = KafkaClient(host_name + ":" + host_port)
        self.producer = SimpleProducer(self.client)

    def create_topic(self, topic_name):
        topic_exists = self.client.has_metadata_for_topic(topic_name)
        if not topic_exists:
            self.client.ensure_topic_exists(topic_name)

    def send_message(self, topic_name, message):
        self.producer.send_messages(topic_name, message)

    def register_consumer(self, callback, parse_json, topic_group, topic_name):
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer_thread = ConsumerThread(consumer, callback, parse_json)
        consumer_thread.start()

    def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
        print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer.seek(0,2)

        for message in consumer:
            message = parse_json(message)
            print "=============" + str(message) + "============"
            message_consume_function(message)
            print "called message consume function"
Example 5
 def listen(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     # print client.topic_partitions()
     consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
     for message in consumer:
         value = message.message.value
         print value
Example 6
def _feed(settings_file, json_item):
    settings = importlib.import_module(settings_file[:-3])
    kafka_conn = KafkaClient(settings.KAFKA_HOSTS)
    topic = settings.KAFKA_INCOMING_TOPIC
    producer = SimpleProducer(kafka_conn)
    print "=> feeding JSON request into {0}...".format(topic)
    print json.dumps(json_item, indent=4)
    kafka_conn.ensure_topic_exists(topic)
    producer.send_messages(topic, json.dumps(json_item))
    print "=> done feeding request."
Example 7
 def listen(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
     for message in consumer:
         value = message.message.value
         value = json.loads(value)
         if value['no'] % 10 == 0:
             print value
             subject = "test mail => "+message.message.value
             body = "Good day! Now is "+datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             send_mail(self.email_address,subject,body)
Example 8
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            self.settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise


    def next_tuple(self):
        """
        input message:
            dict(
                 crawlid = input['crawlid'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 body =  html,
                 attrs = input['attrs']
            )
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """

        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value

        crawled = json.loads(message)
        if crawled['appid'] == self.settings["appid"]:
            safeurl = crawled['url'].encode('utf-8', 'ignore')
            self.log("Lookahead spout received id: " + crawled['crawlid'] + " url: " + safeurl)
            context = {
                'source': 'datawake-lookahead',
                'domain': crawled['attrs']['domain']
            }
            self.emit([crawled['url'], crawled['status_code'], '', '', crawled['body'], crawled['timestamp'], context['source'], context])
Example 9
class Producer():
    def __init__(self, server_list, kafka_port, topic_name):
        self.server_list = server_list
        self.kafka_port = kafka_port
        self.topic_name = topic_name
        self.client = KafkaClient(hosts(self.server_list, self.kafka_port))
        self.producer = SimpleProducer(self.client, batch_send=False)

    def ensure_topic_exists(self):
        self.client.ensure_topic_exists(self.topic_name)

    def forwarder(self, message):
        self.producer.send_messages(self.topic_name, message)
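A hedged usage sketch for the Producer wrapper above; the exact shape of server_list depends on the hosts() helper used by the class (not shown), and the port, topic, and payload are placeholders.

import json

producer = Producer("kafka01,kafka02", 9092, "cnc.logs")  # placeholders
producer.ensure_topic_exists()
producer.forwarder(json.dumps({"no": 1, "temperature": 21.5}))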
Example 10
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([
                    url, '', '', '', html, timestamp, context['source'],
                    context
                ])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Example 11
 def configure_input_queue(self):
     """
     configures the input queue that other services can use to schedule an event to be delivered
     """
     client = KafkaClient(hosts=self.kafka_hosts)
     client.ensure_topic_exists(self.input_topic)
     indexed_consumer = IndexedConsumer(self.input_topic, self.kafka_hosts)
     queue_consumer = KafkaConsumer(self.input_topic,
                                    bootstrap_servers=self.kafka_hosts,
                                    group_id=CONSUMER_GROUP)
     queue_producer = SimpleProducer(KafkaClient(hosts=self.kafka_hosts))
     self.queues.append(
         InputQueue(queue_consumer, indexed_consumer, queue_producer,
                    self.number_of_queues))
Example 12
class KafkaConsumer:

    group = "python-lookahead-consumer"

    def __init__(self,conn_pool,topic,group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None)
        self.consumer.seek(0,2) # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
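A hedged usage sketch for the KafkaConsumer wrapper above; the connection string, topic, and group are placeholders. next() blocks because get_messages() is called with timeout=None.

consumer = KafkaConsumer("kafka01:9092,kafka02:9092",
                         "crawler-out",
                         "python-lookahead-consumer")
while True:
    message = consumer.next()  # blocks until at least one message arrives
    print(message)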
Example 13
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Example 14
class KafkaProducer:
    def __init__(self, conn_pool, topic):
        self.conn_pool = conn_pool
        self.topic = topic
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.producer = SimpleProducer(self.kafka, async=True)

    def send(self, message):
        self.producer.send_messages(self.topic, message)

    def sendBulk(self, messages):
        self.producer.send_messages(self.topic, *messages)

    def close(self):
        self.producer.stop()
        self.kafka.close()
        self.kafka = None
        self.producer = None
Example 15
class CrawlerSpout(Spout):

    group = 'datawake-crawler-in-consumer'.encode()


    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic ='+self.topic+' conn_pool='+self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0,2) # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error",level='error')
            self.log(traceback.format_exc(),level='error')
            raise

    def next_tuple(self):
        """
        input message:
             json.dumps(dict(
                    id = 'abcdefg', #TODO generate UUID,
                    appid = self.appid,
                    url = url,
                    priority = 50,
                    depth = 0,
                    attrs  = dict(
                        userId = context['userId'],
                        org =  context['org'],
                        domain = context['domain']
                    )
                ))
        :return:
        """
        try:
            for message in self.consumer:
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(),level='error')
Example 16
class KafkaProducer:
    def __init__(self, conn_pool, topic):
        self.conn_pool = conn_pool
        self.topic = topic
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.producer = SimpleProducer(self.kafka, async=True)

    def send(self, message):
        self.producer.send_messages(self.topic, message)

    def sendBulk(self, messages):
        self.producer.send_messages(self.topic, *messages)

    def close(self):
        self.producer.stop()
        self.kafka.close()
        self.kafka = None
        self.producer = None
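A hedged usage sketch for the KafkaProducer above; the connection string, topic, and payloads are placeholders. Because the underlying SimpleProducer is created with async=True, close() stops the background sender and the client connection.

producer = KafkaProducer("kafka01:9092", "crawler-in")  # placeholders
producer.send('{"url": "http://example.com/a"}')
producer.sendBulk(['{"url": "http://example.com/b"}',
                   '{"url": "http://example.com/c"}'])
producer.close()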
Example 17
class KafkaConsumer:

    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(self.kafka,
                                       self.group,
                                       self.topic,
                                       max_buffer_size=None)
        self.consumer.seek(0, 2)  # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
Example 18
 def forwarder(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     # print client.topic_partitions()
     producer = SimpleProducer(client, batch_send=False)
     for i in xrange(1, 100):
         with open(self.csvfile, 'r') as FR:
             fields = ("ARRIVAL_TIMESTAMP\t" + "DEFECT\t" +
                       next(FR).strip()).split('\t')
             for cnc_log in FR:
                 prob = 0.8
                 y = lambda x, prob: '<span style="background-color:#bd362f; color:white">FAIL</span>' if randint(
                     0, x) > x * prob else 'PASS'
                 cnc_log = (
                     datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
                     "\t" + y(10, 0.8) + '\t' + cnc_log.strip()).split('\t')
                 zipped = dict(zip(fields, cnc_log))
                 node = zipped
                 sleep_sec = random.uniform(0, 10)
                 time.sleep(sleep_sec)
                 producer.send_messages(self.topic_name, json.dumps(node))
Example 19
class KafkaConnector(object):
    def __init__(self, host_name, host_port):
        self.client = KafkaClient(host_name + ":" + host_port)
        self.producer = SimpleProducer(self.client)

    def create_topic(self, topic_name):
        topic_exists = self.client.has_metadata_for_topic(topic_name)
        if not topic_exists:
            self.client.ensure_topic_exists(topic_name)

    def send_message(self, topic_name, message):
        self.producer.send_messages(topic_name, message)

    def register_consumer(self, callback, parse_json, topic_group, topic_name):
        consumer = SimpleConsumer(self.client,
                                  topic_group,
                                  topic_name,
                                  max_buffer_size=None)
        consumer_thread = ConsumerThread(consumer, callback, parse_json)
        print "Starting new subscriber for topic " + topic_name + ' with group ' + topic_group
        consumer_thread.start()
Example 20
class ImageConvertProcess(MultiDownloadProcess):

    name = "image_convert_process"

    topic_name = "jay.crawled_firehose_images"

    def __init__(self, settings):
        super(ImageConvertProcess, self).__init__(settings)
        self.kafka_client = KafkaClient(self.settings.get("KAFKA_HOSTS"))
        self.kafka_client.ensure_topic_exists(self.topic_name)
        self.producer = SimpleProducer(self.kafka_client)
        #self.lock = RLock()
        self.IC = ImageConvert(settings)
        self.IC.set_logger(self.logger)

    def decode(self, item):
        return map(lambda x: (x.get('url'), x.get('filename'), x.get('path')),
                   json.loads(item)["images"])

    def callback(self, item, flag):
        try:
            if flag:
                item = json.loads(item)
                spider = item.get("meta", {}).get("spiderid")
                if spider in DOMAINS:
                    self.logger.debug("process in pan. spider:%s" % (spider))
                    item["pan_result"] = self.IC.process_image(
                        item.get("meta", {}).get("collection_name"), item)
                    self.logger.debug(
                        "finish process in pan, spider:%s result:%s" %
                        (spider, item["pan_result"]))
                else:
                    self.logger.info("ignore %s images. " % spider)
                self.producer.send_messages(self.topic_name, json.dumps(item))
                self.logger.debug("send item to kafka. ")
            else:
                self.logger.error("download failed")
        except Exception:
            self.logger.error(traceback.format_exc())
Example 21
class FeedProducer:
    """
    Feed Producer class
    use send() to send to any topic
    """

    def __init__(self, broker):
        try:
            self.client = KafkaClient(broker)
            self.prod = SimpleProducer(self.client)
        except KafkaUnavailableError:
            log.critical("\nCluster Unavailable %s : Check broker string\n", broker)
            raise
        except:
            raise

    def send(self, topic, *msgs):
        try:
            self.prod.send_messages(topic, *msgs)
        except LeaderNotAvailableError:
            self.client.ensure_topic_exists(topic)
            return self.send(topic, *msgs)
        except:
            raise
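A hedged usage sketch for FeedProducer; the broker string, topic, and payloads are placeholders. If the topic does not exist yet, send() creates it via ensure_topic_exists() and retries the publish.

producer = FeedProducer("kafka01:9092")
producer.send("feeds.incoming", '{"id": 1}', '{"id": 2}')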
Example 22
class FeedProducer():
    """
    Feed Producer class
    use send() to send to any topic
    """
    def __init__(self, broker):
        try:
            self.client = KafkaClient(broker)
            self.prod = SimpleProducer(self.client)
        except KafkaUnavailableError:
            log.critical("\nCluster Unavailable %s : Check broker string\n",
                         broker)
            raise
        except:
            raise

    def send(self, topic, *msgs):
        try:
            self.prod.send_messages(topic, *msgs)
        except LeaderNotAvailableError:
            self.client.ensure_topic_exists(topic)
            return self.send(topic, *msgs)
        except:
            raise
Example 23
class KafkaMonitor:
    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

        print 'Added crawl to Redis'

    def handle_action_request(self, dict):
        '''
        Processes a valid action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(action=dict['action'],
                                                   spiderid=dict['spiderid'],
                                                   appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

        print 'Added action to Redis'

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"

                    except ValueError:
                        print "bad json recieved"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)
            self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
Example 24
        max_tries = 10
        is_connected = False
        while not is_connected:
            print("Retrying to start consumer client: {0!s}".format(num_tries), flush=True)
            time.sleep(5)
            try:
                client = KafkaClient(kafka_host)
                is_connected = True
            except (LeaderNotAvailableError, ConnectionError, KafkaUnavailableError,) as leader_err2:
                num_tries += 1
                if num_tries == max_tries:
                    raise leader_err2
    # except Exception as err:
    #     print("Error starting consumer client: " + str(err), flush=True)
    #     raise err
    client.ensure_topic_exists(topic)

    # Create consumer
    try:
        consumer = SimpleConsumer(client, consumer_group, topic)
    except LeaderNotAvailableError as leader_err:
        num_tries = 0
        max_tries = 10
        is_connected = False
        while not is_connected:
            print("Retrying to start consumer: {0!s}".format(num_tries), flush=True)
            time.sleep(5)
            try:
                consumer = SimpleConsumer(client, consumer_group, topic)
                is_connected = True
            except LeaderNotAvailableError as leader_err2:
                num_tries += 1
                if num_tries == max_tries:
                    raise leader_err2
Example 25
class BusAdapter(object):
    '''
    The BusAdapter class is intended to be imported to bus modules.
    Instances of this class provide the software bus illusion over
    Kafka. 
    
    Public methods are:
        
        * publish()
        * waitForMessage()
        * subscribeToTopic()
        * unsubscribeFromTopic()
        * addTopicListener()
        * removeTopicListener()
        * mySubscriptions()
        * returnError()
        * close()
    
    A minimal consumer module looks like this:
    
    ::

        # A callback function:
    	def printMessage(topicName, msgText, msgOffset):
    	    print('Msg[%s]: %s' % (topicName, msgText))
    	
    	bus = BusAdapter()
    	# Subscribe to a topic, passing the callback function:
    	bus.subscribeToTopic('exampleTopic', printMessage)
    	
    	while True:
    	    # do anything you like
    	    time.sleep(10)
		    
    
    A corresponding minimal producer module would be like this:
    
    ::
    
        bus = BusAdapter()
        while True:
            # Read one line from console:
            msgText = raw_input("Type a message to send: ('Q' to end.): ")
            if msgText == 'Q':
                break
            else:
                bus.publish(msgText, 'exampleTopic')    
    
    For better structured, but equivalent examples, see :py:class:`Example Producer <kafka_bus_python.example_producer.BusModuleProducer>`
    and :py:class:`Example Consumer <kafka_bus_python.example_consumer.BusModuleConsumer>`.
    
    Clients of this class may install multiple listeners
    for any given topic. The publish() method may be used asynchronously,
    just to send a message to subscribing modules on the bus, or
    synchronously like a remote procedure call.
        
    The BusAdapter wraps payloads into a JSON structure
    as follows: 
    
    ::
    
    	'id'     : <RFC 4122 UUID Version 4>   # e.g. 'b0f4259e-3d01-44bd-9eb3-25981c2dc643'
    	'type'   : {req | resp}
    	'status' : { OK | ERROR }
    	'time'   : <ISO 8601>                  # e.g. '2015-05-31T17:13:41.957350'
    	'content': <text>
    
    It is the responsibility of listener functions to 
    strip this header away, if desired. For an example
    see echo_service.EchoServer's echoRequestDelivery()
    method.
    
    '''
    
    _LEGAL_MSG_TYPES = ['req', 'resp']
    _LEGAL_STATUS    = ['OK', 'ERROR']
    
    _DEFAULT_KAFKA_LISTEN_PORT = 9092
    _KAFKA_SERVERS = [('localhost', _DEFAULT_KAFKA_LISTEN_PORT),
                     ('mono.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
                     ('datastage.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
                     ]

#     _KAFKA_SERVERS = [('mono.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
#                      ('localhost', _DEFAULT_KAFKA_LISTEN_PORT),
#                      ('datastage.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
#                      ]

       
    # Remember whether logging has been initialized (class var!):
    _loggingInitialized = False
    _logger = None

    def __init__(self, 
                 kafkaHost=None, 
                 kafkaPort=None,
                 loggingLevel=logging.DEBUG,
                 logFile=None,
                 kafkaGroupId='school_bus'
                 ):
        '''
        Initialize communications with Kafka.

        :param kafkaHost: hostname or ip address of host where Kafka server runs.
            If None, then BusAdapter._KAFKA_SERVERS are tried in turn.
        :type kafkaHost: {string | None}
        :param kafkaPort: port at which Kafka expects clients to come in.
            if None, then BusAdapter._DEFAULT_KAFKA_LISTEN_PORT is used.
        :type kafkaPort: {int | None}
        :param loggingLevel: detail of logging
        :type loggingLevel: {logging.DEBUG | logging.INFO | logging.ERROR}  
        :param logFile: file to which log is written; console, if None
        :type logFile: {string | None}
        :param kafkaGroupId: name under which message offset management is
            stored [by Kafka in zookeeper]. Different groups of bus modules
            will have different sets of message offsets recorded. You can 
            leave this default.
        :type kafkaGroupId: string
        '''

        if kafkaPort is None:
            kafkaPort = BusAdapter._DEFAULT_KAFKA_LISTEN_PORT
        self.port = kafkaPort
        self.kafkaGroupId = kafkaGroupId
        
        self._setupLogging(loggingLevel, logFile)

        for hostPortTuple in BusAdapter._KAFKA_SERVERS:
            self.logDebug('Contacting Kafka server at %s:%s...' % hostPortTuple)
            try:
                self.kafkaClient = KafkaClient("%s:%s" % hostPortTuple)
            except KafkaUnavailableError:
                # Have we just contacted the last of the available
                # servers?
                if hostPortTuple == BusAdapter._KAFKA_SERVERS[-1]:
                    raise KafkaUnavailableError("No Kafka server found running at any of %s." % str(BusAdapter._KAFKA_SERVERS))
                else:
                    continue
            self.logDebug('Successfully contacted Kafka server at %s:%s...' % hostPortTuple)
            # If succeeded, init the 'bootstrap_servers' array
            # referenced in topic_waiter.py:
            self.bootstrapServers = ['%s:%s' % hostPortTuple]
            # Don't try any other servers:
            break
                
        self.producer    = SimpleProducer(self.kafkaClient)

        # Create a function that has the first method-arg
        # 'self' already built in. That new function is then
        # called with just the remaining positional/keyword parms.
        # In this case: see method :func:`addTopicListener`.
        
        # This way we can by default pass :func:`_deliverResult` to a
        # _TopicWaiter instance, and thereby cause it to invoke our
        # _deliverResult() *method* (which takes the hidden 'self.'
        # Yet other callers to subscribeToTopic() can specify 
        # a *function* which only takes the non-self parameters 
        # specified in method :func:`addTopicListener`. 
        
        self.resultCallback    = partial(self._deliverResult)
        
        # A function that will be called when the result to
        # a synchronous call arrives:
        self.syncResultWaiter  = partial(self._awaitSynchronousReturn)
        
        # Dict mapping topic names to thread objects that listen
        # to the respective topic. Used by subscribeToTopic() and
        # unsubscribeFromTopic():
        self.listenerThreads = {}
        
        # Dict mapping topic names to event objects that provide
        # communication between the topic's thread and the main
        # thread. Used in awaitMessage():
        self.topicEvents = {}
        
        # Dict used for synchronous calls: the dict maps
        # msg UUIDs to the results of a call. Set in 
        # _awaitSynchronousReturn(), and emptied in publish()
        self.resDict = {}

# --------------------------  Public Methods ---------------------
     
    def publish(self, busMessage, topicName=None, sync=False, msgId=None, msgType='req', timeout=None, auth=None):
        '''
        Publish either a string or a BusMessage object. If busMessage
        is a string, then the caller is responsible for ensuring that
        the string is UTF-8, and a topic name must be provided.
        
        If busMessage is a BusMessage object, then that object contains
        all the required information. In this case, parameter topicName
        overrides a topic name that might be stored in the BusMessage.
        
        Messages are wrapped in a JSON structure that provides
        'id', 'type', 'time', and 'content' fields. The 'content' field
        will contain the message payload.
        
        Two ways of using this method: asynchronously, and synchronously.
        In asynchronous invocation the passed-in message is published, and
        this method returns immediately. For this type of invocation just
        provide argument busMessage, and possibly topicName, if busMessage
        is a string. 
        
        Synchronous invocation is just like a remote procedure call.
        In synchronous invocation the passed-in message is published, and 
        this method will wait for a return message that carries the same
        message ID, and is of message type 'resp'. This method then
        returns the **content** of the returned message; the surrounding
        wrapper (time/msgId/msgType...) is stripped.  
        
        :param busMessage: string or BusMessage to publish
        :type busMessage: {string | BusMessage}
        :param topicName: name of topic to publish to. If None, then 
            parameter must be a BusMessage object that contains an
            associated topic name.
        :type topicName: {string | None}
        :param sync: if True, call will not return till answer received,
            or the timeout (if given) has expired.
        :type sync: boolean
        :param msgId: if this publish() call is a response to a prior request,
            the request message's ID must be the id of the response. In that
            case the caller can use this parameter to provide the ID. If
            None, a new message ID is generated.
        :type msgId: string
        :param msgType: value for the message type field of the outgoing message.
            Usually this is 'req', but when calling publish() to return a result
            to a prior request, then set this argument to 'resp'. 
        :param timeout: timeout after which synchronous call should time out.
            if sync is False, the timeout parameter is ignored.
        :type timeout: float
        :param auth: reserved for later authentication mechanism.
        :type auth: not yet known
        :return: value is only defined for synchronous invocation.
        :rtype: string
        :raises ValueError: if targeted topic name is not provided in a msg object,
            or explicitly in the topicName parameter.
        :raises ValueError: if illegal message type is passed in.
        :raises BadInformation: if Kafka does not recognize the provided topic
            **and** Kafka is not configured to create topics on the fly.
        :raises SyncCallTimedOut: if no response is received to a synchronous call
            within the provided timeout period.
        :raises SyncCallRuntimeError: if a message received in response to a 
            synchronous call cannot be parsed.
        '''

        if not isinstance(busMessage, BusMessage):
            # We were passed a raw string to send. The topic name
            # to publish to better be given:
            if topicName is None:
                raise ValueError('Attempt to publish a string without specifying a topic name.')
            msg = busMessage
        else:
            # the busMessage parm is a BusMessage instance:
            # If topicName was given, it overrides any topic name
            # associated with the BusObject; else:
            if topicName is None:
                # Grab topic name from the BusMessage:
                topicName = busMessage.topicName()
                # If the BusMessage did not include a topic name: error
                if topicName is None:
                    raise ValueError('Attempt to publish a BusMessage instance that does not hold a topic name: %s' % str(busMessage))
            # Get the serialized, UTF-8 encoded message from the BusMessage:
            msg = busMessage.content()
            
        # Now msg contains the msg text.
        try:
            self.kafkaClient.ensure_topic_exists(topicName, timeout=5)
        except KafkaTimeoutError:
            raise BadInformation("Topic '%s' is not a recognized topic." % topicName)
        
        # Create a JSON struct:
        if msgId is None:
            msgUuid = str(uuid.uuid4())
        else:
            msgUuid = msgId
        # Sanity check on message type:
        if msgType not in BusAdapter._LEGAL_MSG_TYPES:
            raise ValueError('Legal message types are %s' % str(BusAdapter._LEGAL_MSG_TYPES))
        
        msgDict = dict(zip(['id', 'type', 'time', 'content'],
                           [msgUuid, msgType, datetime.now().isoformat(), msg]))

        # If synchronous operation requested, wait for response:
        if sync:
            
            # Before publishing the request, must prepare for 
            # a function that will be invoked with the result.
            
            # Use instance vars for communication with the result 
            # delivery thread.
            # Use of these instance vars means that publish
            # isn't re-entrant. Fine for now:

            # For the result delivery method to know which msg id
            # we are waiting for:            
            self.uuidToWaitFor   = msgUuid
            
            # For the result delivery method to know which topic
            # we are waiting for:
            self.topicToWaitFor  = topicName

            # For the result delivery method to put a string
            # if an error occurs while processing the result
            # bus message:

            self.syncResultError = None
            
            # Create event that will wake us when result
            # arrived and has been placed in self.resDict:

            self.resultArrivedEvent = threading.Event()

            # If not subscribed to the topic to which this synchronous
            # call is being published, then subscribe to it temporarily:

            wasSubscribed = topicName in self.mySubscriptions()
            if not wasSubscribed:
                self.subscribeToTopic(topicName, self.syncResultWaiter)
            else:
                self.addTopicListener(topicName, self.syncResultWaiter)
            
            # Finally: post the request...
            self.producer.send_messages(topicName, json.dumps(msgDict))
            
            # ... and wait for the answer message to invoke
            # self._awaitSynchronousReturn():
            resBeforeTimeout = self.resultArrivedEvent.wait(timeout)
            
            # Result arrived, and was placed into
            # self.resDict under the msgUuid. Remove the listener
            # that waited for the result:
            
            self.removeTopicListener(topicName, self.syncResultWaiter)
            
            # If we weren't subscribed to this topic, then
            # restore that condition:

            if not wasSubscribed:
                self.unsubscribeFromTopic(topicName)
            
            # If the 'call' timed out, raise exception:
            if not resBeforeTimeout:
                raise SyncCallTimedOut('Synchronous call on topic %s timed out' % topicName)
            
            # A result arrived from the call:
            res = self.resDict.get(msgUuid, None)
            
            # No longer need the result to be saved:
            try:
                del self.resDict[msgUuid]
            except KeyError:
                pass
            
            # Check whether awaitSynchronousReturn() placed an
            # error message into self.syncResultError:

            if self.syncResultError is not None:
                raise(SyncCallRuntimeError(self.syncResultError)) 
            
            return res
        
        else:
            # Not a synchronous call; just publish the request:
            self.producer.send_messages(topicName, json.dumps(msgDict))
       


    def subscribeToTopic(self, topicName, deliveryCallback=None, kafkaLiveCheckTimeout=30):
        '''
        Fork a new thread that keeps waiting for any messages
        on the topic of the given name. Stop listening for the topic
        by calling unsubscribeFromTopic(). 
        
        For convenience, a deliveryCallback function may be passed,
        saving a subsequent call to addTopicListener(). See addTopicListener()
        for details.
        
        If deliveryCallback is absent or None, then method _deliverResult()
        in this class will be used. That method is intended to be a 
        placeholder with no side effects.
        
        It is a no-op to call this method multiple times for the
        same topic.
                 
        :param topicName: official name of topic to listen for.
        :type topicName: string
        :param deliveryCallback: a function that takes two args: a topic
            name, and a topic content string.
        :type deliveryCallback: function
        :param kafkaLiveCheckTimeout: timeout in (fractional) seconds to
            wait when checking for a live Kafka server being available.
        :type kafkaLiveCheckTimeout: float
        :raises KafkaServerNotFound: when no Kafka server responds
        '''
        
        if deliveryCallback is None:
            deliveryCallback = self.resultCallback
            
        if type(deliveryCallback) != types.FunctionType and type(deliveryCallback) != functools.partial:
            raise ValueError("Parameter deliveryCallback must be a function, was of type %s" % type(deliveryCallback))

        try:
            # Does a thread for this msg already exist?
            self.listenerThreads[topicName]
            # Yep (b/c we didn't bomb out). Nothing to do:
            return
        
        except KeyError:
            # No thread exists for this topic. 
            
            # Create an event object that the thread will set()
            # whenever a msg arrives, even if no listeners exist:
            event = threading.Event()
            self.topicEvents[topicName] = event
            
            # Create the thread that will listen to Kafka;
            # raises KafkaServerNotFound if necessary:
            waitThread = _TopicWaiter(topicName, 
                                     self, 
                                     self.kafkaGroupId, 
                                     deliveryCallback=deliveryCallback, 
                                     eventObj=event,
                                     kafkaLiveCheckTimeout=kafkaLiveCheckTimeout)

            # Remember that this thread listens to the given topic:
            self.listenerThreads[topicName] = waitThread
            
            waitThread.start()

    def unsubscribeFromTopic(self, topicName):
        '''
        Unsubscribes from topic. Stops the topic's thread,
        and removes it from bookkeeping so that the Thread object
        will be garbage collected. Same for the Event object
        used by the thread to signal message arrival.
        
        Calling this method for a topic that is already
        unsubscribed is a no-op.
        
        :param topicName: name of topic to unsubscribe from
        :type topicName: string
        '''

        # Delete our record of the Event object used by the thread to
        # indicate message arrivals:
        try:
            del self.topicEvents[topicName]
        except KeyError:
            pass

        try:
            # Does a thread for this msg even exist?
            existingWaitThread = self.listenerThreads[topicName]

            # Yep, it exists. Stop it and remove it from
            # our bookkeeping
            existingWaitThread.stop()
            del self.listenerThreads[topicName]
            
        except KeyError:
            # No thread exists for this topic at all, so all done:
            return
    
    def addTopicListener(self, topicName, deliveryCallback):
        '''
        Add a listener function for a topic for which a
        subscription already exists. Parameter deliverCallback
        must be a function accepting parameters: topicName, rawResult, msgOffset
        It is an error to call the method without first
        having subscribed to the topic.
        
        :param topicName: name of topic to add
        :type topicName: String
        :param deliveryCallback: function to call when message to this topic arrives
        :type deliveryCallback: <function(topicName, rawResult, msgOffset)
        :raises NameError: if caller has not previously subscribed to topicName.

        '''
        
        if type(deliveryCallback) != types.FunctionType and type(deliveryCallback) != functools.partial:
            raise ValueError("Parameter deliveryCallback must be a function, was of type %s" % type(deliveryCallback))
        try:
            # Does a thread for this msg already exist?
            existingWaitThread = self.listenerThreads[topicName]
            
            # Yep (b/c we didn't bomb out). Check whether the 
            # given deliveryCallback is already among the listeners 
            # added earlier:
            try:
                existingWaitThread.listeners().index(deliveryCallback)
                # Both, a thread and this callback already exist, do nothing:
                return
            except ValueError:
                pass
            # Thread exists for this topic, but an additional
            # callback is being registered:
            existingWaitThread.addListener(deliveryCallback)
            return
        except KeyError:
            # No thread exists for this topic, so no deliveryCallback
            # can be added:
            raise NameError("Attempt to add topic listener %s for topic '%s' without first subscribing to '%s'" %
                            (str(deliveryCallback), topicName, topicName))
        
    
    def removeTopicListener(self, topicName, deliveryCallback):
        '''
        Remove a topic listener function from a topic. It is
        a no-op to call this method with a topic that has not
        been subscribed to, or with a deliveryCallback function that
        was never added to the topic.
        
        :param topicName:
        :type topicName:
        :param deliveryCallback:
        :type deliveryCallback:
        '''
        
        try:
            # Does a thread for this msg even exist?
            existingWaitThread = self.listenerThreads[topicName]

            # Yep, exists (we didn't bomb). Now check whether the 
            # given deliveryCallback was actually added to the listeners 
            # earlier:

            existingListeners = existingWaitThread.listeners()
            try:
                existingListeners.index(deliveryCallback)
                # The listener to be removed does exist:
                existingWaitThread.removeListener(deliveryCallback)
                return 
            except ValueError:
                # This listener isn't registered, so all done:
                return
            
        except KeyError:
            # No listener thread exists for this topic at all, so all done:
            return


    def waitForMessage(self, topicName, timeout=None):
        '''
        Block till a message on the given topic arrives. It is
        an error to call this method on a topic to which the
        caller has not previously subscribed.
        
        :param topicName:
        :type topicName:
        :param timeout: seconds (or fractions of second) to wait.
        :type timeout: float
        :returns: True if a message arrived in time, else returns False
        :rtype: boolean
        :raises NameError: on attempt to wait for a topic for which no subscription exists.
        '''
        
        try:
            event = self.topicEvents[topicName]
            return(event.wait(timeout))
        except KeyError:
            raise NameError("Attempt to wait for messages on topic %s, which was never subscribed to." % topicName)
 
    def mySubscriptions(self):
        '''
        Return a list of topic names to which this bus adapter is subscribed.
        
        :return: List of topics to which caller is subscribed
        :rtype: [String]
        '''
        return self.topicEvents.keys()
        
    def returnError(self, req_key, topicName, errMsg):
        '''
        Convenience method when handling an incoming message.
        Returns a message that is marked as an error return.
        
        :param req_key: key of the incoming message; it will be used in the return message as well.
        :type req_key: String
        :param topicName: name of topic to use in the return message
        :type topicName: String
        :param errMsg: error message to include in the return message
        :type errMsg: String
        '''
        
        errMsg = {'resp_key'    : req_key,
                  'type'        : 'resp',
                  'status'      : 'ERROR',
                  'time'        : datetime.now().isoformat(),
                  'content'     : errMsg
                 }
        errMsgJSON = _JSONEncoderBusExtended.makeJSON(errMsg)
        self.publish(errMsgJSON, topicName)
      
    def close(self):
        '''
        Cleanup. All threads are stopped. Kafka
        connection is closed.
        '''
        for thread in self.listenerThreads.values():
            thread.stop()
        self.listenerThreads.clear()
        self.topicEvents.clear()
        
        self.kafkaClient.close()

# --------------------------  Private Methods ---------------------


    def _deliverResult(self, topicName, rawResult, msgOffset):
        '''
        Simple default message delivery callback. Just prints 
        topic name and content. Override in subclass to get 
        more interesting behavior. Remember, though: you (I believe)
        need to do the functools.partial trick to create a function
        for your overriding method that already has 'self' curried out.
        We may be able to simplify that, because the listening threads
        do save the BusAdapter objects that created them.    
        
        :param topicName: name of topic the msg came from
        :type topicName: string
        :param rawResult: the string from the wire; not yet de-serialized
        :type rawResult: string
        :param msgOffset: the Kafka queue offset of the message
        :type msgOffset: int 
        '''
        print('Msg at offset %d: %s' % (msgOffset,rawResult))
        

    def _awaitSynchronousReturn(self, topicName, rawResult, msgOffset):
        '''
        A callback for _TopicWaiter. Invoked from a different thread!!
        This callback is installed by publish() when a synchronous
        bus 'call' is executed. The main thread, i.e. publish() will
        have delivered the request to the bus, and initialized the 
        following instance variables for us:

          * self.uuidToWaitFor: the message id an incoming result must have
          * self.syncResultError: a place for this method to place an error message if necessary
          * self.resultArrivedEvent: a threading.Event() obj which this method will set() when it's done.
        
        :param topicName: name of topic on which a message arrived
        :type topicName: string
        :param rawResult: message payload; a JSON string
        :type rawResult: string
        :param msgOffset: offset in Kafka system
        :type msgOffset: int
        '''
        
        # If this incoming message is for the wrong topic, ignore it.
        # This should never happen, because this method is only
        # installed as a listener while we block for a synchronous call:

        if topicName != self.topicToWaitFor:
            return
        
        # Turn msg JSON into a dict:
        try:
            thisResDict = json.loads(rawResult)
        except ValueError:
            self.syncResultError = 'Bad JSON while waiting for sync response: %s' % rawResult
            # Tell main thread that answer to synchronous
            # call arrived, and was processed:
            self.resultArrivedEvent.set()
            return
        
        # Is this a response msg, and is it the one
        # we are waiting for?
        thisUuid    = thisResDict.get('id', None)
        thisMsgType = thisResDict.get('type', None)
        thisContent = thisResDict.get('content', None)
        
        if thisUuid    == self.uuidToWaitFor and \
           thisMsgType == 'resp':
            # All good; store just the msg content field
            # in a result dict that's shared with the main
            # thread:
            self.resDict[thisUuid] = thisContent
        
            # Tell main thread that answer to synchronous
            # call arrived, and was processed:
            self.resultArrivedEvent.set()
        else:
            # Not the msg we are waiting for:
            return
    
    
    def _setupLogging(self, loggingLevel, logFile):
        if BusAdapter._loggingInitialized:
            # Remove previous file or console handlers,
            # else we get logging output doubled:
            BusAdapter._logger.handlers = []
            
        # Set up logging:
        # A _logger named SchoolBusLog:
        BusAdapter._logger = logging.getLogger('SchoolBusLog')
        BusAdapter._logger.setLevel(loggingLevel)
        
        # A msg formatter that shows datetime, _logger name, 
        # the log level of the message, and the msg.
        # The datefmt=None causes ISO8601 to be used:
        
        formatter = logging.Formatter(fmt='%(asctime)s-%(name)s-%(levelname)s-%(module)s: %(message)s',datefmt=None)
        
        # Create file handler if requested:
        if logFile is not None:
            handler = logging.FileHandler(logFile)
        else:
            # Create console handler:
            handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        handler.setLevel(loggingLevel)
#         # create formatter and add it to the handlers
#         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#         fh.setFormatter(formatter)
#         ch.setFormatter(formatter)
        # Add the handler to the _logger
        BusAdapter._logger.addHandler(handler)
        #**********************
        #BusAdapter._logger.info("Info for you")
        #BusAdapter._logger.warn("Warning for you")
        #BusAdapter._logger.debug("Debug for you")
        #**********************
        
        BusAdapter._loggingInitialized = True


    def logWarn(self, msg):
        '''
        Locally log a warning message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        BusAdapter._logger.warn(msg)

    def logInfo(self, msg):
        '''
        Locally log an info message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        BusAdapter._logger.info(msg)
     
    def logError(self, msg):
        '''
        Locally log an error message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        
        BusAdapter._logger.error(msg)

    def logDebug(self, msg):
        '''
        Locally log a debug message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        BusAdapter._logger.debug(msg)
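
A minimal usage sketch for the BusAdapter methods shown above; the constructor and subscription API are defined earlier in the class and are not reproduced in this excerpt, so the no-argument construction below is only an assumption.

# Illustrative only: BusAdapter's constructor (and its subscription calls)
# live earlier in the class; the no-argument construction is an assumption.
bus = BusAdapter()
print(bus.mySubscriptions())      # topics currently subscribed to
bus.logInfo('adapter is up')      # logged via the 'SchoolBusLog' logger
bus.returnError('req-42', 'someTopic', 'could not parse request')
bus.close()                       # stop listener threads, close the Kafka client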
Example n. 26
0
class RedisMonitor:

    def __init__(self):
        self.setup()

    def setup(self):
        '''
        Connection stuff here so we can mock it
        '''
        self.redis_conn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

        # set up kafka
        self.kafka_conn = KafkaClient(KAFKA_HOSTS)
        self.producer = SimpleProducer(self.kafka_conn)
        self.topic_prefix = KAFKA_TOPIC_PREFIX

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        while True:
            self._do_info()
            self._do_expire()
            self._do_stop()

            time.sleep(0.1)

    def _do_info(self):
        '''
        Processes info action requests
        '''
        for key in self.redis_conn.scan_iter(match="info:*:*"):
            # the master dict to return
            master = {}
            master['uuid'] = self.redis_conn.get(key)
            master['total_pending'] = 0
            master['server_time'] = int(time.time())

            # break down key
            elements = key.split(":")
            dict = {}
            dict['spiderid'] = elements[1]
            dict['appid'] = elements[2]

            if len(elements) == 4:
                dict['crawlid'] = elements[3]

            # generate the information requested
            if 'crawlid' in dict:
                master = self._build_crawlid_info(master, dict)
            else:
                master = self._build_appid_info(master, dict)

            self.redis_conn.delete(key)

            if self._send_to_kafka(master):
                pass
                #print 'Sent info to kafka'
            else:
                print 'Failed to send info to kafka'

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka
        @param master: the final dict to send
        @returns: True if successfully sent to kafka
        '''
        appid_topic = "{prefix}.outbound_{appid}".format(
                                                    prefix=self.topic_prefix,
                                                    appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
                                                    prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(appid_topic)
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # don't want logger in outbound kafka message
            dump = json.dumps(master)
            self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)

            return True
        except Exception as ex:
            print traceback.format_exc()
            pass

        return False

    def _build_appid_info(self, master, dict):
        '''
        Builds the appid info object

        @param master: the master dict
        @param dict: the dict object received
        @return: the appid info object
        '''
        master['total_crawlids'] = 0
        master['total_pending'] = 0
        master['total_domains'] = 0
        master['crawlids'] = {}
        master['appid'] = dict['appid']

        match_string = '{sid}:queue'.format(sid=dict['spiderid'])

        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid']:
                    crawlid = item['crawlid']

                    # add new crawlid to master dict
                    if crawlid not in master['crawlids']:
                        master['crawlids'][crawlid] = {}
                        master['crawlids'][crawlid]['total'] = 0
                        master['crawlids'][crawlid]['high_priority'] = -9999
                        master['crawlids'][crawlid]['low_priority'] = 9999

                        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                                    sid=dict['spiderid'],
                                    aid=dict['appid'],
                                    cid=crawlid)
                        if self.redis_conn.exists(timeout_key):
                            master['crawlids'][crawlid]['expires'] = self.redis_conn.get(timeout_key)

                        master['total_crawlids'] = master['total_crawlids'] + 1

                    if item['priority'] > master['crawlids'][crawlid]['high_priority']:
                        master['crawlids'][crawlid]['high_priority'] = item['priority']

                    if item['priority'] < master['crawlids'][crawlid]['low_priority']:
                        master['crawlids'][crawlid]['low_priority'] = item['priority']

                    master['crawlids'][crawlid]['total'] = master['crawlids'][crawlid]['total'] + 1
                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _get_bin(self, key):
        '''
        Returns a binned dictionary based on redis zscore

        @return: The sorted dict
        '''
        # keys based on score
        sortedDict = {}
        # this doesn't return them in order, need to bin first
        for item in self.redis_conn.zscan_iter(key):
            my_item = pickle.loads(item[0])
            # score is negated in redis
            my_score = -item[1]

            if my_score not in sortedDict:
                sortedDict[my_score] = []

            sortedDict[my_score].append(my_item)

        return sortedDict

    def _build_crawlid_info(self, master, dict):
        '''
        Builds the crawlid info object

        @param master: the master dict
        @param dict: the dict object received
        @return: the crawlid info object
        '''
        master['total_pending'] = 0
        master['appid'] = dict['appid']
        master['crawlid'] = dict['crawlid']

        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(sid=dict['spiderid'],
                                                        aid=dict['appid'],
                                                        cid=dict['crawlid'])
        if self.redis_conn.exists(timeout_key):
            master['expires'] = self.redis_conn.get(timeout_key)

        # get all domain queues
        match_string = '{sid}:queue'.format(sid=dict['spiderid'])
        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid'] and \
                                item['crawlid'] == dict['crawlid']:

                    if 'high_priority' not in master:
                        master['high_priority'] = -99999

                    if 'low_priority' not in master:
                        master['low_priority'] = 99999

                    if item['priority'] > master['high_priority']:
                        master['high_priority'] = item['priority']

                    if item['priority'] < master['low_priority']:
                        master['low_priority'] = item['priority']

                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _do_expire(self):
        '''
        Processes expire requests
        Very similar to _do_stop()
        '''
        for key in self.redis_conn.scan_iter(match="timeout:*:*:*"):
            timeout = float(self.redis_conn.get(key))
            curr_time = time.time()
            if curr_time > timeout:
                # break down key
                elements = key.split(":")
                spiderid = elements[1]
                appid = elements[2]
                crawlid = elements[3]

                # add crawl to blacklist so it doesn't propagate
                redis_key = spiderid + ":blacklist"
                value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
                # add this to the blacklist set
                self.redis_conn.sadd(redis_key, value)

                # everything stored in the queue is now expired
                result = self._purge_crawl(spiderid, appid, crawlid)

                # item to send to kafka
                extras = {}
                extras['action'] = "expire"
                extras['spiderid'] = spiderid
                extras['appid'] = appid
                extras['crawlid'] = crawlid
                extras['total_expired'] = result

                self.redis_conn.delete(key)

                if self._send_to_kafka(extras):
                    #print 'Sent expired ack to kafka'
                    pass
                else:
                    print 'Failed to send expired ack to kafka'

    def _do_stop(self):
        '''
        Processes stop action requests
        '''
        for key in self.redis_conn.scan_iter(match="stop:*:*:*"):
            # break down key
            elements = key.split(":")
            spiderid = elements[1]
            appid = elements[2]
            crawlid = elements[3]
            uuid = self.redis_conn.get(key)

            redis_key = spiderid + ":blacklist"
            value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)

            # add this to the blacklist set
            self.redis_conn.sadd(redis_key, value)

            # purge crawlid from current set
            result = self._purge_crawl(spiderid, appid, crawlid)

            # item to send to kafka
            extras = {}
            extras['action'] = "stop"
            extras['spiderid'] = spiderid
            extras['appid'] = appid
            extras['crawlid'] = crawlid
            extras['total_purged'] = result

            self.redis_conn.delete(key)

            if self._send_to_kafka(extras):
                # delete timeout for crawl (if needed) since stopped
                timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                                        sid=spiderid,
                                        aid=appid,
                                        cid=crawlid)
                self.redis_conn.delete(timeout_key)
                #print 'Sent stop ack to kafka'
            else:
                print 'Failed to send stop ack to kafka'

    def _purge_crawl(self, spiderid, appid, crawlid):
        '''
        Wrapper for purging the crawlid from the queues

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        # purge three times to try to make sure everything is cleaned
        total = self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)

        return total

    def _mini_purge(self, spiderid, appid, crawlid):
        '''
        Actually purges the crawlid from the queue

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        total_purged = 0

        match_string = '{sid}:queue'.format(sid=spiderid)
        # using scan for speed vs keys
        for item in self.redis_conn.zscan_iter(match_string):
            item_key = item[0]
            item = pickle.loads(item_key)
            if 'meta' in item:
                item = item['meta']

            if item['appid'] == appid and item['crawlid'] == crawlid:
                self.redis_conn.zrem(match_string, item_key)
                total_purged = total_purged + 1

        return total_purged
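
A minimal sketch of running the monitor; it assumes the module-level REDIS_HOST, REDIS_PORT, KAFKA_HOSTS and KAFKA_TOPIC_PREFIX settings read by setup() are importable.

# Sketch: run the monitor as a long-lived process. The constants used by
# setup() (REDIS_HOST, REDIS_PORT, KAFKA_HOSTS, KAFKA_TOPIC_PREFIX) are
# assumed to be defined elsewhere in the project.
if __name__ == '__main__':
    monitor = RedisMonitor()
    try:
        monitor.run()          # blocks inside _main_loop()
    except KeyboardInterrupt:
        pass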
Example n. 27
0
                'name': 'id',
                'type': 'int'
            },
            {
                'name': 'random',
                'type': 'int'
            },
            {
                'name': 'data',
                'type': 'string'
            },
        ],
    }))

kafka = KafkaClient(kafkaConnect)
kafka.ensure_topic_exists(topic)
producer = SimpleProducer(kafka)

for x in xrange(maxRecords):
    writer = avro.io.DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)

    writer.write(
        {
            'id': x,
            'random': randint(1, 3),
            'data': str(uuid.uuid4().get_hex().upper()[0:20])
        }, encoder)
    raw_bytes = bytes_writer.getvalue()
    producer.send_messages(topic, raw_bytes)
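
For completeness, a hedged sketch of the matching consumer side that decodes the Avro records produced above; the SimpleConsumer import and the group name are assumptions, while schema, kafka and topic are the objects already built in this example.

# Sketch of a matching consumer/decoder for the producer loop above.
# The consumer group name is hypothetical; 'schema', 'kafka' and 'topic'
# are the objects created earlier in this example.
from kafka import SimpleConsumer

consumer = SimpleConsumer(kafka, 'avro-test-group', topic)
reader = avro.io.DatumReader(schema)
for offset_and_message in consumer.get_messages(count=10):
    raw_bytes = offset_and_message.message.value
    decoder = avro.io.BinaryDecoder(io.BytesIO(raw_bytes))
    record = reader.read(decoder)
    print(record)    # e.g. a dict with 'id', 'random' and 'data' fields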
Example n. 28
0
class KafkaMonitor:
    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d + 1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None

            print("self.settings['PLUGIN_DIR'] + instance.schema====",
                  self.settings['PLUGIN_DIR'] + instance.schema)
            with open(self.settings['PLUGIN_DIR'] +
                      instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(
            sorted(self.plugins_dict.items(), key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key1),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        total2 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key2),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=temp_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                    self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(
                    self.kafka_conn,
                    self.settings['KAFKA_GROUP'],
                    self.settings['KAFKA_INCOMING_TOPIC'],
                    auto_commit=True,
                    iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True

        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)
                    print('the_dict', the_dict)
                    found_plugin = False

                    print('self.plugins_dict', self.plugins_dict)
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']

                        print('instance==', instance)

                        schema = obj['schema']

                        print(
                            'schema********************************************',
                            schema)
                        try:
                            print('before       v = self.validator(schema)')

                            v = self.validator(schema)

                            print('after       v = self.validator(schema)')

                            print('the_dict-------', the_dict)

                            v.validate(the_dict)
                            found_plugin = True

                            print('found_plugin====', found_plugin)

                            self._increment_plugin_stat(
                                instance.__class__.__name__, the_dict)

                            print('instance.handle(the_dict)', the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:

                            print('  except ValidationError:======')

                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn(
                            "Did not find schema to validate "
                            "request",
                            extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the loaded message object for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':

                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][
                        key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
Example n. 29
0
class KafkaMonitor:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d+1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None
            with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(),
                                               key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json, stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key1),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        total2 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key2),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                   key='{k}:lifetime'.format(k=temp_key),
                                                   cycle_time=self.settings['STATS_CYCLE'],
                                                   roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                        self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(self.kafka_conn,
                                               self.settings['KAFKA_GROUP'],
                                               self.settings['KAFKA_INCOMING_TOPIC'],
                                               auto_commit=True,
                                               iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True
        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)
                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                    instance.__class__.__name__,
                                    the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn("Did not find schema to validate "
                                         "request", extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the loaded message object for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':

                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
Example n. 30
0
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            self.settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            dict(
                 crawlid = input['crawlid'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 body =  html,
                 attrs = input['attrs']
            )
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """

        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value

        crawled = json.loads(message)
        if crawled['appid'] == self.settings["appid"]:
            safeurl = crawled['url'].encode('utf-8', 'ignore')
            self.log("Lookahead spout received id: " + crawled['crawlid'] +
                     " url: " + safeurl)
            context = {
                'source': 'datawake-lookahead',
                'domain': crawled['attrs']['domain']
            }
            self.emit([
                crawled['url'], crawled['status_code'], '', '',
                crawled['body'], crawled['timestamp'], context['source'],
                context
            ])
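
A hedged sketch of the kind of message this spout expects on the crawler-out topic, following the docstring of next_tuple(); the broker address and literal topic name are assumptions, since the real values come from all_settings in initialize().

# Sketch: publish one message of the shape documented in next_tuple().
# The broker address and topic name are assumptions.
from kafka import KafkaClient, SimpleProducer

client = KafkaClient('localhost:9092')
producer = SimpleProducer(client)
crawled = {
    'crawlid': 'crawl-1',
    'appid': 'datawake',
    'url': 'http://example.com',
    'status_code': 200,
    'status_msg': 'Success',
    'timestamp': '2015-01-01T00:00:00',
    'links_found': [],
    'body': '<html></html>',
    'attrs': {'domain': 'example.com'},
}
producer.send_messages('crawler-out-topic', json.dumps(crawled))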
Example n. 31
0
class KafkaMonitor:
    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)

    def get_method(self, key):
        if key == 'handle_crawl_request':
            return self.handle_crawl_request
        elif key == 'handle_action_request':
            return self.handle_action_request
        raise AttributeError(key)

    def setup(self):
        self.redis_conn = redis.Redis(host=self.settings.REDIS_HOST,
                                      port=self.settings.REDIS_PORT)

        self.kafka_conn.ensure_topic_exists(self.settings.KAFKA_INCOMING_TOPIC)
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       self.settings.KAFKA_GROUP,
                                       self.settings.KAFKA_INCOMING_TOPIC,
                                       auto_commit=True,
                                       iter_timeout=1.0)

        self.result_method = self.get_method(self.settings.SCHEMA_METHOD)

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError:
                            print "json did not validate against the schema"

                    except ValueError:
                        print "bad json recieved"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)
            self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
Esempio n. 32
0
kafkaConnect = args["k"]
topic        = args["t"]
quiet        = args["q"]


schema = avro.schema.parse(json.dumps({
    'name': 'kafkatest',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': 'id', 'type': 'int'},
        {'name': 'random', 'type': 'int'},
        {'name': 'data', 'type': 'string'},
    ],
}))


kafka = KafkaClient(kafkaConnect)
kafka.ensure_topic_exists(topic)
producer = SimpleProducer(kafka)

writer = avro.io.DatumWriter(schema)

for x in xrange(maxRecords):
    # serialize exactly one record per Kafka message: use a fresh
    # buffer/encoder each iteration so previously written records are
    # not re-sent along with the new one
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write({'id': x,
                  'random': randint(1, 3),
                  'data': str(uuid.uuid4().get_hex().upper()[0:20])},
                 encoder)
    raw_bytes = bytes_writer.getvalue()
    producer.send_messages(topic, raw_bytes)
    if not quiet:
        print "Sent message ID: " + str(x)