class RedisMonitor: def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.redis_conn = None self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: self.redis_conn.info() self.logger.debug("Successfully connected to Redis") except ConnectionError: self.logger.error("Failed to connect to Redis") # essential to functionality sys.exit(1) self._load_plugins() self._setup_stats() def import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d+1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins and defaults ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}" .format(cls=key)) the_class = self.import_class(key) instance = the_class() instance.redis_conn = self.redis_conn instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_regex = instance.regex mini = {} mini['instance'] = instance if the_regex is None: raise ImportError() # continue mini['regex'] = the_regex self.plugins_dict[plugins[key]] = mini self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(), key=lambda t: t[0])) def run(self): ''' The external main run loop ''' self._main_loop() def _main_loop(self): ''' The internal while true main loop for the redis monitor ''' self.logger.debug("Running main loop") print 'Running main loop' old_time = 0 while True: for plugin_key in self.plugins_dict: obj = self.plugins_dict[plugin_key] self._process_plugin(obj) if self.settings['STATS_DUMP'] != 0: new_time = int(time.time() / self.settings['STATS_DUMP']) # only log every X seconds if new_time != old_time: self._dump_stats() if self.settings['STATS_DUMP_CRAWL']: self._dump_crawl_stats() if self.settings['STATS_DUMP_QUEUE']: self._dump_queue_stats() old_time = new_time time.sleep(0.1) def _process_plugin(self, plugin): ''' Logic to handle each plugin that is active @param plugin: a plugin dict object ''' instance = plugin['instance'] regex = plugin['regex'] for key in self.redis_conn.scan_iter(match=regex): val = self.redis_conn.get(key) try: self._process_key_val(instance, key, val) except Exception: 
self.logger.error(traceback.format_exc()) self._increment_fail_stat('{k}:{v}'.format(k=key, v=val)) self.redis_conn.delete(key) def _process_key_val(self, instance, key, val): ''' Logic to let the plugin instance process the redis key/val Split out for unit testing @param instance: the plugin instance @param key: the redis key @param val: the key value from redis ''' if instance.check_precondition(key, val): combined = '{k}:{v}'.format(k=key, v=val) self._increment_total_stat(combined) self._increment_plugin_stat( instance.__class__.__name__, combined) instance.handle(key, val) self.redis_conn.delete(key) def _setup_stats(self): ''' Sets up the stats ''' # stats setup self.stats_dict = {} if self.settings['STATS_TOTAL']: self._setup_stats_total() if self.settings['STATS_PLUGINS']: self._setup_stats_plugins() def _setup_stats_total(self): ''' Sets up the total stats collectors ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:redis-monitor:total' temp_key2 = 'stats:redis-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=self.redis_conn, key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=self.redis_conn, key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self): ''' Sets up the plugin stats collectors ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:redis-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=self.redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter(redis_conn=self.redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _increment_total_stat(self, item): ''' Increments the total stat counters @param item: the unique print for HLL counter ''' item = item + str(time.time()) if 'total' in self.stats_dict: self.logger.debug("Incremented total stats") for key in 
self.stats_dict['total']: if key == 'lifetime': self.stats_dict['total'][key].increment(item) else: self.stats_dict['total'][key].increment() def _increment_fail_stat(self, item): ''' Increments the total stat counters @param item: the unique print for HLL counter ''' item = item + str(time.time()) if 'fail' in self.stats_dict: self.logger.debug("Incremented fail stats") for key in self.stats_dict['fail']: if key == 'lifetime': self.stats_dict['fail'][key].increment(item) else: self.stats_dict['fail'][key].increment() def _increment_plugin_stat(self, name, item): ''' Increments the total stat counters @param name: The formal name of the plugin @param item: the unique print for HLL counter ''' item = item + str(time.time()) if 'plugins' in self.stats_dict: self.logger.debug("Incremented plugin '{p}' plugin stats"\ .format(p=name)) for key in self.stats_dict['plugins'][name]: if key == 'lifetime': self.stats_dict['plugins'][name][key].increment(item) else: self.stats_dict['plugins'][name][key].increment() def _dump_stats(self): ''' Dumps the stats out ''' extras = {} if 'total' in self.stats_dict: self.logger.debug("Compiling total/fail dump stats") for key in self.stats_dict['total']: final = 'total_{t}'.format(t=key) extras[final] = self.stats_dict['total'][key].value() for key in self.stats_dict['fail']: final = 'fail_{t}'.format(t=key) extras[final] = self.stats_dict['fail'][key].value() if 'plugins' in self.stats_dict: self.logger.debug("Compiling plugin dump stats") for name in self.stats_dict['plugins']: for key in self.stats_dict['plugins'][name]: final = 'plugin_{n}_{t}'.format(n=name, t=key) extras[final] = self.stats_dict['plugins'][name][key].value() if not self.logger.json: self.logger.info('Redis Monitor Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Redis Monitor Stats Dump', extra=extras) def _dump_crawl_stats(self): ''' Dumps flattened crawling stats so the spiders do not have to ''' extras = {} spiders = {} spider_set = set() total_spider_count = 0 keys = self.redis_conn.keys('stats:crawler:*:*:*') for key in keys: # we only care about the spider elements = key.split(":") spider = elements[3] if spider not in spiders: spiders[spider] = 0 if len(elements) == 6: # got a time based stat response = elements[4] end = elements[5] final = '{s}_{r}_{e}'.format(s=spider, r=response, e=end) if end == 'lifetime': value = self.redis_conn.execute_command("PFCOUNT", key) else: value = self.redis_conn.zcard(key) extras[final] = value elif len(elements) == 5: # got a spider identifier spiders[spider] += 1 total_spider_count += 1 spider_set.add(spider) else: self.logger.warn("Unknown crawler stat key", {"key":key}) # simple counts extras['unique_spider_count'] = len(spider_set) extras['total_spider_count'] = total_spider_count for spider in spiders: extras['{k}_spider_count'.format(k=spider)] = spiders[spider] if not self.logger.json: self.logger.info('Crawler Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Crawler Stats Dump', extra=extras) def _dump_queue_stats(self): ''' Dumps basic info about the queue lengths for the spider types ''' extras = {} keys = self.redis_conn.keys('*:*:queue') total_backlog = 0 for key in keys: elements = key.split(":") spider = elements[0] domain = elements[1] spider = 'queue_' + spider if spider not in extras: extras[spider] = {} extras[spider]['spider_backlog'] = 0 extras[spider]['num_domains'] = 0 count = self.redis_conn.zcard(key) total_backlog += count 
extras[spider]['spider_backlog'] += count extras[spider]['num_domains'] += 1 extras['total_backlog'] = total_backlog if not self.logger.json: self.logger.info('Queue Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Queue Stats Dump', extra=extras)
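# ---------------------------------------------------------------------------
# A minimal sketch of the plugin contract the RedisMonitor above relies on.
# _load_plugins() imports each plugin, attaches `redis_conn`, calls
# _set_logger() and setup(), and requires a non-None `regex` attribute;
# _process_plugin() then routes every matching Redis key through
# check_precondition() and handle(). The class name and key pattern below
# are illustrative only, not part of the project.
# ---------------------------------------------------------------------------
class ExamplePlugin(object):

    # keys matching this pattern are handed to this plugin by _process_plugin()
    regex = "example:*"

    def _set_logger(self, logger):
        self.logger = logger

    def setup(self, settings):
        # self.redis_conn has already been attached by _load_plugins()
        self.settings = settings

    def check_precondition(self, key, value):
        # return True only when this key/value pair should be handled now
        return value is not None

    def handle(self, key, value):
        # do the actual work; the monitor deletes the key afterwards
        self.logger.debug("handled {k} -> {v}".format(k=key, v=value))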
class JayRedisMonitor:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.redis_conn = None
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            #name=self.settings['LOGGER_NAME'],
            name="jay-redis-monitor",
            dir=self.settings['LOG_DIR'],
            #file=self.settings['LOG_FILE'],
            file="jay_redis_monitor.log",
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])

        try:
            self.redis_conn.info()
            self.logger.debug("Successfully connected to Redis")
        except ConnectionError:
            self.logger.error("Failed to connect to Redis")
            # essential to functionality
            sys.exit(1)

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        self.logger.debug("Running main loop")
        print 'Running main loop'

        jaystats = JayStatsMonitor()
        jaystats.setup(self.logger, self.redis_conn)
        jayinfo = JayInfoMonitor()
        jayinfo.setup(self.logger, self.redis_conn)

        while True:
            jaystats.handle()
            jayinfo.handle()
            time.sleep(1)
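# ---------------------------------------------------------------------------
# A hedged sketch of how either monitor class above is typically driven:
# construct it with a local settings file, call setup() (optionally
# overriding the log level), then run() to enter the main loop. The settings
# file name and the log level used here are illustrative assumptions.
# ---------------------------------------------------------------------------
def run_monitor():
    monitor = RedisMonitor('localsettings.py')
    monitor.setup(level='INFO')
    try:
        monitor.run()
    except KeyboardInterrupt:
        monitor.logger.info("Closing Redis Monitor")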
def main(): # initial main parser setup parser = argparse.ArgumentParser( description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for ' 'debugging.', add_help=False) parser.add_argument('-h', '--help', action=ArgparseHelper, help='show this help message and exit') subparsers = parser.add_subparsers(help='commands', dest='command') # args to use for all commands base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('-kh', '--kafka-host', action='store', required=False, help="The override Kafka host") base_parser.add_argument('-s', '--settings', action='store', required=False, help="The settings file to read from", default="localsettings.py") base_parser.add_argument('-ll', '--log-level', action='store', required=False, help="The log level", default=None, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) # list command list_parser = subparsers.add_parser('list', help='List all Kafka topics', parents=[base_parser]) # dump command dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic', parents=[base_parser]) dump_parser.add_argument('-t', '--topic', action='store', required=True, help="The Kafka topic to read from") dump_parser.add_argument('-c', '--consumer', action='store', required=False, default='default', help="The Kafka consumer id to use") dump_parser.add_argument('-b', '--from-beginning', action='store_const', required=False, const=True, help="Read the topic from the beginning") dump_parser.add_argument('-nb', '--no-body', action='store_const', required=False, const=True, default=False, help="Do not include the raw html 'body' key in" " the json dump of the topic") dump_parser.add_argument('-p', '--pretty', action='store_const', required=False, const=True, default=False, help="Pretty print the json objects consumed") dump_parser.add_argument('-d', '--decode-base64', action='store_const', required=False, const=True, default=False, help="Decode the base64 encoded raw html body") args = vars(parser.parse_args()) wrapper = SettingsWrapper() settings = wrapper.load(args['settings']) kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS'] log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL'] logger = LogFactory.get_instance(level=log_level, name='kafkadump') logger.debug("Connecting to {0}...".format(kafka_host)) try: kafka = KafkaClient(kafka_host) logger.info("Connected to {0}".format(kafka_host)) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) logger.error(message) sys.exit(1) if args['command'] == 'list': logger.debug('Running list command') print "Topics:" for topic in kafka.topic_partitions.keys(): print "-", topic return 0 elif args['command'] == 'dump': logger.debug('Running dump command') topic = args["topic"] consumer_id = args["consumer"] @MethodTimer.timeout(5, None) def _hidden(): try: logger.debug("Ensuring topic {t} exists".format(t=topic)) kafka.ensure_topic_exists(topic) logger.debug("Getting Kafka consumer") consumer = SimpleConsumer(kafka, consumer_id, topic, buffer_size=1024*100, fetch_size_bytes=1024*100, max_buffer_size=None ) return consumer except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) logger.error(message) sys.exit(1) consumer = _hidden() if consumer is None: logger.error("Could not fully connect to Kafka within the timeout") sys.exit(1) if args["from_beginning"]: logger.debug("Seeking to beginning") consumer.seek(0, 0) else: logger.debug("Reading from the end") consumer.seek(0, 2) num_records = 0 total_bytes = 0 item = None while True: try: for message in consumer.get_messages(): if message is None: logger.debug("no message") break logger.debug("Received message") val = message.message.value try: item = json.loads(val) if args['decode_base64'] and 'body' in item: item['body'] = base64.b64decode(item['body']) if args['no_body'] and 'body' in item: del item['body'] except ValueError: logger.info("Message is not a JSON object") item = val body_bytes = len(item) if args['pretty']: print json.dumps(item, indent=4) else: print item num_records = num_records + 1 total_bytes = total_bytes + body_bytes except KeyboardInterrupt: logger.debug("Keyboard interrupt received") break except: logger.error(traceback.print_exc()) break total_mbs = float(total_bytes) / (1024*1024) if item is not None: print "Last item:" print json.dumps(item, indent=4) if num_records > 0: logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}" .format(n=num_records, m=total_mbs, kb=(float(total_bytes) / num_records / 1024))) else: logger.info("No records consumed") num_records = 0 logger.info("Closing Kafka connection") kafka.close() return 0
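# ---------------------------------------------------------------------------
# The _hidden() helper above uses the MethodTimer.timeout decorator: if the
# wrapped call does not return within the given number of seconds, the
# supplied default value is returned instead. A small, hedged illustration of
# the same pattern; the import path is assumed from the scutils utility
# package and the slow function body is made up for demonstration.
# ---------------------------------------------------------------------------
import time

from scutils.method_timer import MethodTimer

@MethodTimer.timeout(5, None)
def connect_with_timeout():
    # stand-in for a slow external connection attempt
    time.sleep(10)
    return "connected"

result = connect_with_timeout()
if result is None:
    print("Could not connect within the 5 second timeout")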
import argparse

from scutils.settings_wrapper import SettingsWrapper

# set up arg parser
parser = argparse.ArgumentParser(
    description='Example SettingsWrapper parser.\n')
parser.add_argument('-s', '--settings', action='store', required=False,
                    help="The default settings file",
                    default="settings.py")
parser.add_argument('-o', '--override-settings', action='store',
                    required=False,
                    help="The override settings file",
                    default="localsettings.py")
parser.add_argument('-v', '--variable', action='store', required=False,
                    help="The variable to print out",
                    default=None)
args = vars(parser.parse_args())

# load up settings
wrapper = SettingsWrapper()
my_settings = wrapper.load(default=args['settings'],
                           local=args['override_settings'])

if args['variable'] is not None:
    if args['variable'] in my_settings:
        print(args['variable'], '=', my_settings[args['variable']])
    else:
        print(args['variable'], "not in loaded settings")
else:
    print("Full settings:", my_settings)
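# ---------------------------------------------------------------------------
# For context, a hedged sketch of the two settings files the example above
# loads by default: SettingsWrapper.load() reads the default file first and
# then overlays any names found in the local override file. The script name
# `example.py`, the variable names and the values below are illustrative.
# ---------------------------------------------------------------------------

# --- settings.py (defaults) ---
LOG_LEVEL = 'INFO'
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# --- localsettings.py (local overrides) ---
LOG_LEVEL = 'DEBUG'   # wins over the default above when both files are loaded

# python example.py -v LOG_LEVEL
#   -> LOG_LEVEL = DEBUG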
class KafkaMonitor(object): consumer = None def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test self.my_uuid = str(uuid.uuid4()).split('-')[4] def _import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d+1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins, defaults and settings.py ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}".format(cls=key)) the_class = self._import_class(key) instance = the_class() instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_schema = None with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file: the_schema = json.load(the_file) mini = {} mini['instance'] = instance mini['schema'] = the_schema self.logger.debug("Successfully loaded plugin {cls}".format(cls=key)) self.plugins_dict[plugins[key]] = mini self.plugins_dict = OrderedDict(sorted(list(self.plugins_dict.items()), key=lambda t: t[0])) def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator) def _setup_stats(self): ''' Sets up the stats collection ''' self.stats_dict = {} redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], db=self.settings.get('REDIS_DB')) try: redis_conn.info() self.logger.debug("Connected to Redis in StatsCollector Setup") self.redis_conn = redis_conn except ConnectionError: self.logger.warn("Failed to connect to Redis in StatsCollector" " Setup, no stats will be collected") return if self.settings['STATS_TOTAL']: self._setup_stats_total(redis_conn) if self.settings['STATS_PLUGINS']: self._setup_stats_plugins(redis_conn) def _setup_stats_total(self, redis_conn): ''' Sets up the total stats collectors @param redis_conn: the redis connection ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:kafka-monitor:total' temp_key2 = 'stats:kafka-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) 
self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self, redis_conn): ''' Sets up the plugin stats collectors @param redis_conn: the redis connection ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _setup_kafka(self): ''' Sets up kafka connections ''' self.consumer = self._create_consumer() self.logger.debug("Successfully connected to Kafka") def extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in list(properties.items()): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def _main_loop(self): ''' Continuous loop that reads from a kafka topic and tries to validate incoming messages ''' self.logger.debug("Processing messages") old_time = 0 while True: self._process_messages() if self.settings['STATS_DUMP'] != 0: new_time = int(old_div(time.time(), self.settings['STATS_DUMP'])) # only log every X seconds if new_time != old_time: self._dump_stats() old_time = new_time self._report_self() time.sleep(self.settings['SLEEP_TIME']) def _process_messages(self): try: for message in self.consumer: if message is None: self.logger.debug("no message") break try: self._increment_total_stat(message.value) loaded_dict = json.loads(message.value) found_plugin = False for key in self.plugins_dict: # to prevent reference modification the_dict = copy.deepcopy(loaded_dict) obj = self.plugins_dict[key] instance = obj['instance'] schema = 
obj['schema'] try: self.validator(schema).validate(the_dict) found_plugin = True self._increment_plugin_stat( instance.__class__.__name__, the_dict) ret = instance.handle(the_dict) # break if nothing is returned if ret is None: break except ValidationError: pass if not found_plugin: extras = {} extras['parsed'] = True extras['valid'] = False extras['data'] = the_dict self.logger.warn("Did not find schema to validate " "request", extra=extras) self._increment_fail_stat(the_dict) except ValueError: extras = {} extras['parsed'] = False extras['valid'] = False extras['data'] = message.value self.logger.warning('Unparseable JSON Received', extra=extras) self._increment_fail_stat(message.value) except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek_to_end() self.logger.error("Kafka offset out of range error") def _increment_total_stat(self, string): ''' Increments the total stat counters @param string: the loaded message object for the counter ''' string = string + str(time.time()) if 'total' in self.stats_dict: self.logger.debug("Incremented total stats") for key in self.stats_dict['total']: if key == 'lifetime': self.stats_dict['total'][key].increment(string) else: self.stats_dict['total'][key].increment() def _increment_fail_stat(self, item): ''' Increments the total stat counters @param item: the loaded message object for HLL counter ''' if isinstance(item, dict): item['ts'] = time.time() elif isinstance(item, str): item = item + str(time.time()) if 'fail' in self.stats_dict: self.logger.debug("Incremented fail stats") for key in self.stats_dict['fail']: if key == 'lifetime': self.stats_dict['fail'][key].increment(item) else: self.stats_dict['fail'][key].increment() def _increment_plugin_stat(self, name, item): ''' Increments the total stat counters @param name: The formal name of the plugin @param dict: the loaded message object for HLL counter ''' item['ts'] = time.time() if 'plugins' in self.stats_dict: self.logger.debug("Incremented plugin '{p}' plugin stats"\ .format(p=name)) for key in self.stats_dict['plugins'][name]: if key == 'lifetime': self.stats_dict['plugins'][name][key].increment(item) else: self.stats_dict['plugins'][name][key].increment() def _dump_stats(self): ''' Dumps the stats out ''' extras = {} if 'total' in self.stats_dict: self.logger.debug("Compiling total/fail dump stats") for key in self.stats_dict['total']: final = 'total_{t}'.format(t=key) extras[final] = self.stats_dict['total'][key].value() for key in self.stats_dict['fail']: final = 'fail_{t}'.format(t=key) extras[final] = self.stats_dict['fail'][key].value() if 'plugins' in self.stats_dict: self.logger.debug("Compiling plugin dump stats") for name in self.stats_dict['plugins']: for key in self.stats_dict['plugins'][name]: final = 'plugin_{n}_{t}'.format(n=name, t=key) extras[final] = self.stats_dict['plugins'][name][key].value() if not self.logger.json: self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Kafka Monitor Stats Dump', extra=extras) def run(self): ''' Set up and run ''' self._setup_kafka() self._load_plugins() self._setup_stats() self._main_loop() def _report_self(self): ''' Reports the kafka monitor uuid to redis ''' key = "stats:kafka-monitor:self:{m}:{u}".format( m=socket.gethostname(), u=self.my_uuid) self.redis_conn.set(key, time.time()) self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT']) def feed(self, json_item): ''' Feeds a json item into the Kafka topic @param 
json_item: The loaded json object ''' @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False) def _feed(json_item): producer = self._create_producer() topic = self.settings['KAFKA_INCOMING_TOPIC'] if not self.logger.json: self.logger.info('Feeding JSON into {0}\n{1}'.format( topic, json.dumps(json_item, indent=4))) else: self.logger.info('Feeding JSON into {0}\n'.format(topic), extra={'value': json_item}) if producer is not None: producer.send(topic, json_item) producer.flush() producer.close(timeout=10) return True else: return False result = _feed(json_item) if result: self.logger.info("Successfully fed item to Kafka") else: self.logger.error("Failed to feed item into Kafka") @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_consumer(self): """Tries to establing the Kafka consumer connection""" try: brokers = self.settings['KAFKA_HOSTS'] self.logger.debug("Creating new kafka consumer using brokers: " + str(brokers) + ' and topic ' + self.settings['KAFKA_INCOMING_TOPIC']) return KafkaConsumer( self.settings['KAFKA_INCOMING_TOPIC'], group_id=self.settings['KAFKA_GROUP'], bootstrap_servers=brokers, consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'], auto_offset_reset=self.settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'], auto_commit_interval_ms=self.settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'], enable_auto_commit=self.settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'], max_partition_fetch_bytes=self.settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize kafka consumer for topic", {'ex': traceback.format_exc(), 'topic': self.settings['KAFKA_INCOMING_TOPIC']}) raise @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_producer(self): """Tries to establish a Kafka consumer connection""" try: brokers = self.settings['KAFKA_HOSTS'] self.logger.debug("Creating new kafka producer using brokers: " + str(brokers)) return KafkaProducer(bootstrap_servers=brokers, value_serializer=lambda m: json.dumps(m), retries=3, linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'], buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize kafka producer.", {'ex': traceback.format_exc()}) raise def close(self): ''' Call to properly tear down the Kafka Monitor ''' if self.consumer is not None: self.consumer.close()
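# ---------------------------------------------------------------------------
# extend_with_default() above is the pattern from the jsonschema docs for
# filling missing properties with their schema defaults while validating.
# A small, self-contained illustration of what the extended validator does;
# the schema and payload here are made up for demonstration.
# ---------------------------------------------------------------------------
from jsonschema import Draft4Validator, validators

def extend_with_default(validator_class):
    validate_properties = validator_class.VALIDATORS["properties"]

    def set_defaults(validator, properties, instance, schema):
        for error in validate_properties(validator, properties, instance, schema):
            yield error
        for property, subschema in list(properties.items()):
            if "default" in subschema:
                instance.setdefault(property, subschema["default"])

    return validators.extend(validator_class, {"properties": set_defaults})

DefaultValidatingDraft4Validator = extend_with_default(Draft4Validator)

schema = {
    "type": "object",
    "properties": {
        "url": {"type": "string"},
        "priority": {"type": "integer", "default": 1},
    },
    "required": ["url"],
}

request = {"url": "http://example.com"}
DefaultValidatingDraft4Validator(schema).validate(request)
print(request)   # {'url': 'http://example.com', 'priority': 1}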
def main(): # initial main parser setup parser = argparse.ArgumentParser( description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for ' 'debugging.', add_help=False) parser.add_argument('-h', '--help', action=ArgparseHelper, help='show this help message and exit') subparsers = parser.add_subparsers(help='commands', dest='command') # args to use for all commands base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('-kh', '--kafka-host', action='store', required=False, help="The override Kafka host") base_parser.add_argument('-s', '--settings', action='store', required=False, help="The settings file to read from", default="localsettings.py") base_parser.add_argument( '-ll', '--log-level', action='store', required=False, help="The log level", default=None, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) # list command list_parser = subparsers.add_parser('list', help='List all Kafka topics', parents=[base_parser]) # dump command dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic', parents=[base_parser]) dump_parser.add_argument('-t', '--topic', action='store', required=True, help="The Kafka topic to read from") dump_parser.add_argument('-c', '--consumer', action='store', required=False, default=None, help="The Kafka consumer id to use") dump_parser.add_argument('-b', '--from-beginning', action='store_const', required=False, const=True, help="Read the topic from the beginning") dump_parser.add_argument('-nb', '--no-body', action='store_const', required=False, const=True, default=False, help="Do not include the raw html 'body' key in" " the json dump of the topic") dump_parser.add_argument('-p', '--pretty', action='store_const', required=False, const=True, default=False, help="Pretty print the json objects consumed") dump_parser.add_argument('-d', '--decode-base64', action='store_const', required=False, const=True, default=False, help="Decode the base64 encoded raw html body") args = vars(parser.parse_args()) wrapper = SettingsWrapper() settings = wrapper.load(args['settings']) kafka_host = args['kafka_host'] if args['kafka_host'] else settings[ 'KAFKA_HOSTS'] log_level = args['log_level'] if args['log_level'] else settings[ 'LOG_LEVEL'] logger = LogFactory.get_instance(level=log_level, name='kafkadump') if args['command'] == 'list': try: logger.debug("Connecting to {0}...".format(kafka_host)) kafka = KafkaClient(kafka_host) logger.info("Connected to {0}".format(kafka_host)) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) logger.error(message) sys.exit(1) logger.debug('Running list command') print("Topics:") for topic in list(kafka.topic_partitions.keys()): print("-", topic) kafka.close() return 0 elif args['command'] == 'dump': logger.debug('Running dump command') topic = args["topic"] consumer_id = args["consumer"] try: logger.debug("Getting Kafka consumer") offset = 'earliest' if args["from_beginning"] else 'latest' consumer = KafkaConsumer( # 消费来自demo.crawled_firehose话题的消息 topic, group_id=consumer_id, bootstrap_servers=kafka_host, consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'], auto_offset_reset=offset, auto_commit_interval_ms=settings[ 'KAFKA_CONSUMER_COMMIT_INTERVAL_MS'], enable_auto_commit=settings[ 'KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'], max_partition_fetch_bytes=settings[ 'KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES']) except NoBrokersAvailable as ex: logger.error('Unable to connect to Kafka') sys.exit(1) num_records = 0 total_bytes = 0 item = None while True: try: for message in consumer: if message is None: logger.debug("no message") break logger.debug("Received message") val = message.value try: item = json.loads(val) if args['decode_base64'] and 'body' in item: item['body'] = base64.b64decode(item['body']) if args['no_body'] and 'body' in item: del item['body'] except ValueError: logger.info("Message is not a JSON object") item = val body_bytes = len(item) if args['pretty']: print(json.dumps(item, indent=4)) else: print(item) num_records = num_records + 1 total_bytes = total_bytes + body_bytes except KeyboardInterrupt: logger.debug("Keyboard interrupt received") break except: logger.error(traceback.print_exc()) break total_mbs = old_div(float(total_bytes), (1024 * 1024)) if item is not None: print("Last item:") print(json.dumps(item, indent=4)) if num_records > 0: logger.info( "Num Records: {n}, Total MBs: {m}, kb per message: {kb}". format(n=num_records, m=total_mbs, kb=(float(total_bytes) / num_records / 1024))) else: logger.info("No records consumed") num_records = 0 logger.info("Closing Kafka connection") try: consumer.close() except: # Exception is thrown when group_id is None. # See https://github.com/dpkp/kafka-python/issues/619 pass return 0
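# ---------------------------------------------------------------------------
# Presumably the module is executed directly as a command line tool; a typical
# entry point is sketched below, with example invocations as comments. The
# file name `kafkadump.py` and the topic name are assumptions for illustration;
# the flags themselves come from the argparse setup above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sys.exit(main())

# Example usage:
#   python kafkadump.py list -s localsettings.py
#   python kafkadump.py dump -t demo.crawled_firehose -p -b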
class RestService(object): # static strings SUCCESS = 'SUCCESS' FAILURE = 'FAILURE' UNKNOWN_ERROR = "An error occurred while processing your request." MUST_JSON = "The payload must be valid JSON." DOES_NOT_EXIST = "The desired endpoint does not exist" BAD_SCHEMA = "JSON did not validate against schema." consumer = None producer = None closed = False start_time = 0 _consumer_thread = None _kafka_thread = None _heartbeat_thread = None _redis_thread = None def __init__(self, settings_name): """ @param settings_name: the local settings file name """ self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.app = Flask(__name__) self.kafka_connected = False self.redis_connected = False self.my_uuid = str(uuid.uuid4()).split('-')[4] self.uuids = {} self.uuids_lock = threading.Lock() self.validator = self._extend_with_default(Draft4Validator) self.schemas = {} def setup(self, level=None, log_file=None, json=None): """ Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json """ self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self._decorate_routes() self._spawn_redis_connection_thread() self._spawn_kafka_connection_thread() # spawn heartbeat processing loop self._heartbeat_thread = Thread(target=self._heartbeat_loop) self._heartbeat_thread.setDaemon(True) self._heartbeat_thread.start() self.start_time = self.get_time() # disable flask logger if self.settings['FLASK_LOGGING_ENABLED'] == False: log = logging.getLogger('werkzeug') log.disabled = True self._load_schemas() def get_time(self): """Returns the current time""" return time.time() def _load_schemas(self): """Loads any schemas for JSON validation""" for filename in os.listdir(self.settings['SCHEMA_DIR']): if filename[-4:] == 'json': name = filename[:-5] with open(self.settings['SCHEMA_DIR'] + filename) as the_file: self.schemas[name] = json.load(the_file) self.logger.debug("Successfully loaded " + filename + " schema") def _extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in list(properties.items()): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def _spawn_redis_connection_thread(self): """Spawns a redis connection thread""" self.logger.debug("Spawn redis connection thread") self.redis_connected = False self._redis_thread = Thread(target=self._setup_redis) self._redis_thread.setDaemon(True) self._redis_thread.start() def _spawn_kafka_connection_thread(self): """Spawns a kafka connection thread""" self.logger.debug("Spawn kafka 
connection thread") self.kafka_connected = False self._kafka_thread = Thread(target=self._setup_kafka) self._kafka_thread.setDaemon(True) self._kafka_thread.start() def _spawn_kafka_consumer_thread(self): """Spawns a kafka continuous consumer thread""" self.logger.debug("Spawn kafka consumer thread""") self._consumer_thread = Thread(target=self._consumer_loop) self._consumer_thread.setDaemon(True) self._consumer_thread.start() def _consumer_loop(self): """The main consumer loop""" self.logger.debug("running main consumer thread") while not self.closed: if self.kafka_connected: self._process_messages() time.sleep(self.settings['KAFKA_CONSUMER_SLEEP_TIME']) def _process_messages(self): """Processes messages received from kafka""" try: for message in self.consumer: try: if message is None: self.logger.debug("no message") break loaded_dict = json.loads(message.value) self.logger.debug("got valid kafka message") with self.uuids_lock: if 'uuid' in loaded_dict: if loaded_dict['uuid'] in self.uuids and \ self.uuids[loaded_dict['uuid']] != 'poll': self.logger.debug("Found Kafka message from request") self.uuids[loaded_dict['uuid']] = loaded_dict else: self.logger.debug("Got poll result") self._send_result_to_redis(loaded_dict) else: self.logger.debug("Got message not intended for this process") except ValueError: extras = {} if message is not None: extras["data"] = message.value self.logger.warning('Unparseable JSON Received from kafka', extra=extras) self._check_kafka_disconnect() except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek_to_end() self.logger.error("Kafka offset out of range error") def _send_result_to_redis(self, result): """Sends the result of a poll to redis to be used potentially by another process @param result: the result retrieved from kafka""" if self.redis_connected: self.logger.debug("Sending result to redis") try: key = "rest:poll:{u}".format(u=result['uuid']) self.redis_conn.set(key, json.dumps(result)) except ConnectionError: self.logger.error("Lost connection to Redis") self._spawn_redis_connection_thread() else: self.logger.warning("Unable to send result to redis, not connected") def _check_kafka_disconnect(self): """Checks the kafka connection is still valid""" for node_id in self.consumer._client._conns: conn = self.consumer._client._conns[node_id] if conn.state == ConnectionStates.DISCONNECTED or \ conn.state == ConnectionStates.DISCONNECTING: self._spawn_kafka_connection_thread() break def _heartbeat_loop(self): """A main run loop thread to do work""" self.logger.debug("running main heartbeat thread") while not self.closed: time.sleep(self.settings['SLEEP_TIME']) self._report_self() def _report_self(self): """ Reports the crawler uuid to redis """ if self.redis_connected: self.logger.debug("Reporting self to redis") try: key = "stats:rest:self:{m}:{u}".format( m=socket.gethostname(), u=self.my_uuid) self.redis_conn.set(key, self.get_time()) self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT']) except ConnectionError: self.logger.error("Lost connection to Redis") self._spawn_redis_connection_thread() else: self.logger.warn("Cannot report self to redis, not connected") @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _setup_redis(self): """Returns a Redis Client""" if not self.closed: try: self.logger.debug("Creating redis connection to host " + str(self.settings['REDIS_HOST'])) self.redis_conn = redis.StrictRedis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], 
db=self.settings['REDIS_DB']) self.redis_conn.info() self.redis_connected = True self.logger.info("Successfully connected to redis") except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize redis client.", {'ex': traceback.format_exc()}) raise def _setup_kafka(self): """ Sets up kafka connections """ # close older connections if self.consumer is not None: self.logger.debug("Closing existing kafka consumer") self.consumer.close() self.consumer = None if self.producer is not None: self.logger.debug("Closing existing kafka producer") self.producer.flush() self.producer.close(timeout=10) self.producer = None # create new connections self._consumer_thread = None self.logger.debug("Creating kafka connections") self.consumer = self._create_consumer() if not self.closed: self.logger.debug("Kafka Conumer created") self.producer = self._create_producer() if not self.closed: self.logger.debug("Kafka Producer created") if not self.closed: self.kafka_connected = True self.logger.info("Connected successfully to Kafka") self._spawn_kafka_consumer_thread() @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_consumer(self): """Tries to establing the Kafka consumer connection""" if not self.closed: try: self.logger.debug("Creating new kafka consumer using brokers: " + str(self.settings['KAFKA_HOSTS']) + ' and topic ' + self.settings['KAFKA_TOPIC_PREFIX'] + ".outbound_firehose") return KafkaConsumer( self.settings['KAFKA_TOPIC_PREFIX'] + ".outbound_firehose", group_id=None, bootstrap_servers=self.settings['KAFKA_HOSTS'], consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'], auto_offset_reset=self.settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'], auto_commit_interval_ms=self.settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'], enable_auto_commit=self.settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'], max_partition_fetch_bytes=self.settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize kafka consumer for topic", {'ex': traceback.format_exc()}) raise @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_producer(self): """Tries to establish a Kafka consumer connection""" if not self.closed: try: self.logger.debug("Creating new kafka producer using brokers: " + str(self.settings['KAFKA_HOSTS'])) return KafkaProducer(bootstrap_servers=self.settings['KAFKA_HOSTS'], value_serializer=lambda v: json.dumps(v).encode('utf-8'), retries=3, linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'], buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize kafka producer.", {'ex': traceback.format_exc()}) raise def run(self): """Main flask run loop""" self.logger.info("Running main flask method on port " + str(self.settings['FLASK_PORT'])) self.app.run(host='0.0.0.0', port=self.settings['FLASK_PORT']) def _create_ret_object(self, status=SUCCESS, data=None, error=False, error_message=None, error_cause=None): """ Create generic reponse objects. 
:param str status: The SUCCESS or FAILURE of the request :param obj data: The data to return :param bool error: Set to True to add Error response :param str error_message: The generic error message :param str error_cause: The cause of the error :returns: A dictionary of values """ ret = {} if status == self.FAILURE: ret['status'] = self.FAILURE else: ret['status'] = self.SUCCESS ret['data'] = data if error: ret['error'] = {} if error_message is not None: ret['error']['message'] = error_message if error_cause is not None: ret['error']['cause'] = error_cause else: ret['error'] = None return ret def _close_thread(self, thread, thread_name): """Closes daemon threads @param thread: the thread to close @param thread_name: a human readable name of the thread """ if thread is not None and thread.isAlive(): self.logger.debug("Waiting for {} thread to close".format(thread_name)) thread.join(timeout=self.settings['DAEMON_THREAD_JOIN_TIMEOUT']) if thread.isAlive(): self.logger.warn("{} daemon thread unable to be shutdown" " within timeout".format(thread_name)) def close(self): """ Cleans up anything from the process """ self.logger.info("Closing Rest Service") self.closed = True # close threads self._close_thread(self._redis_thread, "Redis setup") self._close_thread(self._heartbeat_thread, "Heartbeat") self._close_thread(self._kafka_thread, "Kafka setup") self._close_thread(self._consumer_thread, "Consumer") # close kafka if self.consumer is not None: self.logger.debug("Closing kafka consumer") self.consumer.close() if self.producer is not None: self.logger.debug("Closing kafka producer") self.producer.close(timeout=10) def _calculate_health(self): """Returns a string representation of the node health @returns: GREEN if fully connected, YELLOW if partially connected, RED if not connected """ if self.redis_connected and self.kafka_connected: return "GREEN" elif self.redis_connected or self.kafka_connected: return "YELLOW" else: return "RED" def _kafka_success(self, response): ''' Callback for successful send ''' self.logger.debug("Sent message to Kafka") def _kafka_failure(self, response): ''' Callback for failed send ''' self.logger.error("Failed to send message to Kafka") self._spawn_kafka_connection_thread() def _feed_to_kafka(self, json_item): """Sends a request to Kafka :param json_item: The json item to send :returns: A boolean indicating whther the data was sent successfully or not """ @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False) def _feed(json_item): try: self.logger.debug("Sending json to kafka at " + str(self.settings['KAFKA_PRODUCER_TOPIC'])) future = self.producer.send(self.settings['KAFKA_PRODUCER_TOPIC'], json_item) future.add_callback(self._kafka_success) future.add_errback(self._kafka_failure) self.producer.flush() return True except Exception as e: self.logger.error("Lost connection to Kafka") self._spawn_kafka_connection_thread() return False return _feed(json_item) # Routes -------------------- def _decorate_routes(self): """ Decorates the routes to use within the flask app """ self.logger.debug("Decorating routes") # self.app.add_url_rule('/', 'catch', self.catch, methods=['GET'], # defaults={'path': ''}) self.app.add_url_rule('/<path:path>', 'catch', self.catch, methods=['GET', 'POST'], defaults={'path': ''}) self.app.add_url_rule('/', 'index', self.index, methods=['POST', 'GET']) self.app.add_url_rule('/feed', 'feed', self.feed, methods=['POST']) self.app.add_url_rule('/poll', 'poll', self.poll, methods=['POST']) @log_call('Non-existant route called') 
@error_catch def catch(self, path): return self._create_ret_object(self.FAILURE, None, True, self.DOES_NOT_EXIST), 404 @log_call('\'index\' endpoint called') @error_catch def index(self): data = { "kafka_connected": self.kafka_connected, "redis_connected": self.redis_connected, "uptime_sec": int(self.get_time() - self.start_time), "my_id": self.my_uuid, "node_health": self._calculate_health() } return data @validate_json @log_call('\'feed\' endpoint called') @error_catch def feed(self): # proof of concept to write things to kafka if self.kafka_connected: json_item = request.get_json() self.wait_for_response = False result = self._feed_to_kafka(json_item) if 'uuid' in json_item: self.wait_for_response = True with self.uuids_lock: self.uuids[json_item['uuid']] = None if result: true_response = None if self.wait_for_response: self.logger.debug("expecting kafka response for request") the_time = self.get_time() found_item = False while not found_item and int(self.get_time() - the_time) <= self.settings['WAIT_FOR_RESPONSE_TIME']: if self.uuids[json_item['uuid']] is not None: found_item = True true_response = self.uuids[json_item['uuid']] with self.uuids_lock: del self.uuids[json_item['uuid']] else: with self.uuids_lock: # key still exists, means we didnt find get our # response in time if json_item['uuid'] in self.uuids: self.uuids[json_item['uuid']] = 'poll' self.logger.debug("Did not find response, " "adding to poll") if found_item: self.logger.debug("Got successful reponse back from kafka") else: self.logger.warn("Did not get response within timeout " "from kafka. If the request is still " "running, use the `/poll` API") true_response = { "poll_id": json_item['uuid'] } else: self.logger.debug("Not expecting response from kafka") return self._create_ret_object(self.SUCCESS, true_response) self.logger.warn("Unable to write request to Kafka, not connected") return self._create_ret_object(self.FAILURE, None, True, "Unable to connect to Kafka"), 500 @validate_json @validate_schema('poll') @log_call('\'poll\' endpoint called') @error_catch def poll(self): """Retrieves older requests that may not make it back quick enough""" if self.redis_connected: json_item = request.get_json() result = None try: key = "rest:poll:{u}".format(u=json_item['poll_id']) result = self.redis_conn.get(key) if result is not None: result = json.loads(result) self.logger.debug("Found previous poll") self.redis_conn.delete(key) return self._create_ret_object(self.SUCCESS, result) else: self.logger.debug("poll key does not exist") return self._create_ret_object(self.FAILURE, None, True, "Could not find matching poll_id"), 404 except ConnectionError: self.logger.error("Lost connection to Redis") self._spawn_redis_connection_thread() except ValueError: extras = { "value": result } self.logger.warning('Unparseable JSON Received from redis', extra=extras) self.redis_conn.delete(key) return self._create_ret_object(self.FAILURE, None, True, "Unparseable JSON Received " "from redis"), 500 self.logger.warn("Unable to poll redis, not connected") return self._create_ret_object(self.FAILURE, None, True, "Unable to connect to Redis"), 500
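# ---------------------------------------------------------------------------
# A hedged sketch of how a client might exercise the endpoints wired up in
# _decorate_routes(): POST JSON to /feed (optionally carrying a 'uuid' so the
# service waits for the Kafka response) and, if the answer arrives late,
# retrieve it via /poll using the returned poll_id. It uses the third-party
# requests library; the host, port and payload fields are illustrative, and
# only the endpoint names and the uuid/poll_id keys come from the service
# above.
# ---------------------------------------------------------------------------
import uuid

import requests

base = "http://localhost:5343"          # assumed FLASK_PORT value
request_id = str(uuid.uuid4())

# submit a request and wait (up to WAIT_FOR_RESPONSE_TIME) for the answer
resp = requests.post(base + "/feed",
                     json={"uuid": request_id, "appid": "testapp"})
result = resp.json()

# if the service timed out waiting on Kafka it hands back a poll_id instead
if result.get("data") and "poll_id" in result["data"]:
    poll = requests.post(base + "/poll",
                         json={"poll_id": result["data"]["poll_id"]})
    print(poll.json())
else:
    print(result)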
class Feed:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json, stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def feed(self, json_item):
        instance = ScraperHandler()
        instance._set_logger(self.logger)
        instance.setup(self.settings)
        the_schema = None
        with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
            the_schema = json.load(the_file)

        the_dict = json_item
        ret = True
        try:
            self.validator(the_schema).validate(the_dict)
            instance.handle(the_dict)
            self.logger.info("Successfully fed item to Kafka")
        except ValidationError:
            self.logger.error("Failed to feed item into Kafka")
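# ---------------------------------------------------------------------------
# A hedged sketch of driving the Feed class above: load settings, set up the
# logger and validator, then push a single crawl request dict. The settings
# file name and the request fields are illustrative assumptions; the class
# only requires that the dict validate against the ScraperHandler schema.
# ---------------------------------------------------------------------------
def feed_example():
    feeder = Feed('localsettings.py')
    feeder.setup(level='INFO')
    feeder.feed({
        "url": "http://example.com",
        "appid": "testapp",
        "crawlid": "abc123",
    })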
class KafkaMonitor: def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test def _import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d+1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins, defaults and settings.py ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}".format(cls=key)) the_class = self._import_class(key) instance = the_class() instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_schema = None with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file: the_schema = json.load(the_file) mini = {} mini['instance'] = instance mini['schema'] = the_schema self.plugins_dict[plugins[key]] = mini self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(), key=lambda t: t[0])) def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator) def _setup_stats(self): ''' Sets up the stats collection ''' self.stats_dict = {} redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: redis_conn.info() self.logger.debug("Connected to Redis in StatsCollector Setup") except ConnectionError: self.logger.warn("Failed to connect to Redis in StatsCollector" " Setup, no stats will be collected") return if self.settings['STATS_TOTAL']: self._setup_stats_total(redis_conn) if self.settings['STATS_PLUGINS']: self._setup_stats_plugins(redis_conn) def _setup_stats_total(self, redis_conn): ''' Sets up the total stats collectors @param redis_conn: the redis connection ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:kafka-monitor:total' temp_key2 = 'stats:kafka-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set 
up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self, redis_conn): ''' Sets up the plugin stats collectors @param redis_conn: the redis connection ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter(redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _setup_kafka(self): ''' Sets up kafka connections ''' @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False) def _hidden_setup(): try: self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) self.kafka_conn.ensure_topic_exists( self.settings['KAFKA_INCOMING_TOPIC']) self.consumer = SimpleConsumer(self.kafka_conn, self.settings['KAFKA_GROUP'], self.settings['KAFKA_INCOMING_TOPIC'], auto_commit=True, iter_timeout=1.0) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) self.logger.error(message) sys.exit(1) return True ret_val = _hidden_setup() if ret_val: self.logger.debug("Successfully connected to Kafka") else: self.logger.error("Failed to set up Kafka Connection within" " timeout") # this is essential to running the kafka monitor sys.exit(1) def extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in properties.iteritems(): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def _main_loop(self): ''' Continuous loop that reads from a kafka topic and tries to validate incoming messages ''' self.logger.debug("Processing messages") old_time = 0 while True: self._process_messages() if self.settings['STATS_DUMP'] != 0: new_time = int(time.time() / self.settings['STATS_DUMP']) # only log every X seconds if new_time != old_time: self._dump_stats() old_time = new_time time.sleep(.01) def _process_messages(self): try: for message in self.consumer.get_messages(): if message is None: self.logger.debug("no message") break try: self._increment_total_stat(message.message.value) the_dict = json.loads(message.message.value) found_plugin = False for key in self.plugins_dict: obj = self.plugins_dict[key] instance = obj['instance'] schema = obj['schema'] try: self.validator(schema).validate(the_dict) found_plugin = True self._increment_plugin_stat( instance.__class__.__name__, the_dict) ret = instance.handle(the_dict) # break if nothing is returned if ret is None: break except ValidationError: pass if not found_plugin: extras = {} extras['parsed'] = True extras['valid'] = False extras['data'] = the_dict self.logger.warn("Did not find schema to validate " "request", extra=extras) self._increment_fail_stat(the_dict) except ValueError: extras = {} extras['parsed'] = False extras['valid'] = False extras['data'] = message.message.value self.logger.warning('Unparseable JSON Received', extra=extras) self._increment_fail_stat(message.message.value) except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek(0, 2) self.logger.error("Kafka offset out of range error") def _increment_total_stat(self, string): ''' Increments the total stat counters @param string: the loaded message object for the counter ''' string = string + str(time.time()) if 'total' in self.stats_dict: self.logger.debug("Incremented total stats") for key in self.stats_dict['total']: if key == 'lifetime': self.stats_dict['total'][key].increment(string) else: self.stats_dict['total'][key].increment() def _increment_fail_stat(self, item): ''' Increments the total stat counters @param item: the loaded message object for HLL counter ''' if isinstance(item, dict): item['ts'] = time.time() elif isinstance(item, str): item = item + str(time.time()) if 'fail' in self.stats_dict: self.logger.debug("Incremented fail stats") for key in self.stats_dict['fail']: if key == 'lifetime': self.stats_dict['fail'][key].increment(item) else: self.stats_dict['fail'][key].increment() def _increment_plugin_stat(self, name, item): ''' Increments the total stat counters @param name: The formal name of the plugin @param dict: the loaded message object for 
HLL counter ''' item['ts'] = time.time() if 'plugins' in self.stats_dict: self.logger.debug("Incremented plugin '{p}' plugin stats"\ .format(p=name)) for key in self.stats_dict['plugins'][name]: if key == 'lifetime': self.stats_dict['plugins'][name][key].increment(item) else: self.stats_dict['plugins'][name][key].increment() def _dump_stats(self): ''' Dumps the stats out ''' extras = {} if 'total' in self.stats_dict: self.logger.debug("Compiling total/fail dump stats") for key in self.stats_dict['total']: final = 'total_{t}'.format(t=key) extras[final] = self.stats_dict['total'][key].value() for key in self.stats_dict['fail']: final = 'fail_{t}'.format(t=key) extras[final] = self.stats_dict['fail'][key].value() if 'plugins' in self.stats_dict: self.logger.debug("Compiling plugin dump stats") for name in self.stats_dict['plugins']: for key in self.stats_dict['plugins'][name]: final = 'plugin_{n}_{t}'.format(n=name, t=key) extras[final] = self.stats_dict['plugins'][name][key].value() if not self.logger.json: self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Kafka Monitor Stats Dump', extra=extras) def run(self): ''' Set up and run ''' self._setup_kafka() self._load_plugins() self._setup_stats() self._main_loop() def feed(self, json_item): ''' Feeds a json item into the Kafka topic @param json_item: The loaded json object ''' @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False) def _feed(json_item): try: self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) topic = self.settings['KAFKA_INCOMING_TOPIC'] producer = SimpleProducer(self.kafka_conn) except KafkaUnavailableError: self.logger.error("Unable to connect to Kafka") return False if not self.logger.json: self.logger.info('Feeding JSON into {0}\n{1}'.format( topic, json.dumps(json_item, indent=4))) else: self.logger.info('Feeding JSON into {0}\n'.format(topic), extra={'value': json_item}) self.kafka_conn.ensure_topic_exists(topic) producer.send_messages(topic, json.dumps(json_item)) return True result = _feed(json_item) if result: self.logger.info("Successfully fed item to Kafka") else: self.logger.error("Failed to feed item into Kafka")
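# A hypothetical way to wire the KafkaMonitor above together; the settings file
# name and the fed request are examples, not part of the original source.
if __name__ == '__main__':
    monitor = KafkaMonitor('localsettings.py')
    monitor.setup(level='DEBUG')

    # either push a single request into the incoming Kafka topic ...
    monitor.feed({'url': 'istresearch.com', 'appid': 'testapp', 'crawlid': 'abc123'})

    # ... or run the consumer loop that validates incoming messages against the
    # plugin schemas and hands matches to the plugin instances
    monitor.run()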
class Dispatcher:
    def __init__(self, tasks, redis_conn):
        self.tasks = tasks                 # initial seed URL queue
        self.redis_conn = redis_conn
        self.wrapper = SettingsWrapper()
        self.spiders = []                  # spider nodes currently running
        self.spiders_weights = None        # weight of each spider node
        self.settings = None
        self.logger = None

    def setup(self):
        """Load configuration from the settings file"""
        self.settings = self.wrapper.load('settings.py')
        self.logger = LogFactory.get_instance(
            json=self.settings['LOG_JSON'],
            stdout=self.settings['LOG_STDOUT'],
            level=self.settings['LOG_LEVEL'],
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

    def schedule_seeds(self):
        """
        Distribute the initial seed queue
        :return:
        """
        # load the scheduling algorithm
        robin = Robin()
        count = 0  # counter: execute the batched pipeline every 100 items
        pip = self.redis_conn.pipeline()
        while True:
            task_json = self.redis_conn.lpop('seeds')
            if not task_json:
                time.sleep(3)  # wait 3 seconds before polling redis again
                continue
            self.logger.debug('Distributing initial seed URLs........')
            task = pickle.loads(task_json)
            url = task['url']
            spider_type = task['spider_type']
            domain = tldextract.extract(url).domain
            # update the largest spider node id known to the weighted scheduler
            spiders_weights = copy.deepcopy(self.spiders_weights)
            max_job = max(spiders_weights.keys())
            if max_job != robin.get_max_job():
                robin.update_max_job(max_job)
            # use the weighted round-robin algorithm to pick a spider node
            job_id = robin.choose_spider(spiders_weights)
            if job_id == -1:  # the spider node has failed
                pass
            queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                spider_type=spider_type, job_id=job_id, domain=domain)
            priority = task['priority']
            pip.zadd(queue_key, pickle.dumps(task), priority)
            count += 1
            if count == 100:
                pip.execute()
                count = 0

    def get_spiders_weights(self, interval):
        """
        Periodically fetch the weight of every spider node
        :return:
        """
        spiders_weights = dict()
        for key in self.redis_conn.keys('weight:spider:*:*'):
            job_id = int(key.split(':')[3])
            spiders_weights[job_id] = int(self.redis_conn.get(key))
        self.spiders_weights = spiders_weights
        t = Timer(interval, self.get_spiders_weights, (interval, ))
        t.start()

    def schedule_tasks(self):
        """Task scheduling on the master node"""
        # load the scheduling algorithm
        robin = Robin()
        count = 0  # counter: execute the batched pipeline every 500 items
        pip = self.redis_conn.pipeline()
        while True:
            self.logger.debug('Distributing newly added URLs to the spider nodes.........')
            task_json = self.redis_conn.lpop('tasks')
            task = pickle.loads(task_json)
            url = task['url']
            spider_type = task['spider_type']
            domain = tldextract.extract(url).domain
            # update the largest spider node id known to the weighted scheduler
            spiders_weights = copy.deepcopy(self.spiders_weights)
            max_job = max(spiders_weights.keys())
            if max_job != robin.get_max_job():
                robin.update_max_job(max_job)
            # use the weighted round-robin algorithm to pick a spider node
            job_id = robin.choose_spider(spiders_weights)
            queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                spider_type=spider_type, job_id=job_id, domain=domain)
            priority = task['priority']
            pip.zadd(queue_key, pickle.dumps(task), priority)
            count += 1
            if count == 500:
                pip.execute()
                count = 0

    def run(self):
        self.get_spiders_weights(3)
        # distribute the initial seed queue
        schedule_seeds_thread = Thread(target=self.schedule_seeds)
        schedule_seeds_thread.start()
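# The Robin scheduler used above is not shown in this listing. As an
# illustration only (not the original implementation), a smooth weighted
# round-robin selector over the {job_id: weight} map could look like this;
# the class and method names here are assumptions.
class SmoothWeightedRoundRobin:
    """Nginx-style smooth weighted round-robin over {job_id: weight}."""

    def __init__(self):
        self.current = {}

    def choose(self, weights):
        if not weights or sum(weights.values()) <= 0:
            return -1  # no healthy spider node available
        total = 0
        best = None
        for job_id, weight in weights.items():
            # every node gains its weight, then the largest accumulator wins
            self.current[job_id] = self.current.get(job_id, 0) + weight
            total += weight
            if best is None or self.current[job_id] > self.current[best]:
                best = job_id
        self.current[best] -= total
        return best

# node 2 (weight 2) receives half of the picks, nodes 1 and 3 a quarter each
rr = SmoothWeightedRoundRobin()
print([rr.choose({1: 1, 2: 2, 3: 1}) for _ in range(4)])  # -> [2, 1, 3, 2]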
class JayRedisMonitor: def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.redis_conn = None self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, #name=self.settings['LOGGER_NAME'], name = "jay-redis-monitor", dir=self.settings['LOG_DIR'], #file=self.settings['LOG_FILE'], file="jay_redis_monitor.log", bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: self.redis_conn.info() self.logger.debug("Successfully connected to Redis") except ConnectionError: self.logger.error("Failed to connect to Redis") # essential to functionality sys.exit(1) def run(self): ''' The external main run loop ''' self._main_loop() def _main_loop(self): ''' The internal while true main loop for the redis monitor ''' self.logger.debug("Running main loop") print 'Running main loop' jaystats = JayStatsMonitor() jaystats.setup(self.logger,self.redis_conn) jayinfo = JayInfoMonitor() jayinfo.setup(self.logger,self.redis_conn) while True: jaystats.handle() jayinfo.handle() time.sleep(1)
import argparse
from scutils.settings_wrapper import SettingsWrapper

# set up arg parser
parser = argparse.ArgumentParser(
    description='Example SettingsWrapper parser.\n')
parser.add_argument('-s', '--settings', action='store', required=False,
                    help="The default settings file", default="settings.py")
parser.add_argument('-o', '--override-settings', action='store', required=False,
                    help="The override settings file",
                    default="localsettings.py")
parser.add_argument('-v', '--variable', action='store', required=False,
                    help="The variable to print out", default=None)
args = vars(parser.parse_args())

# load up settings
wrapper = SettingsWrapper()
my_settings = wrapper.load(default=args['settings'],
                           local=args['override_settings'])

if args['variable'] is not None:
    if args['variable'] in my_settings:
        print args['variable'], '=', my_settings[args['variable']]
    else:
        print args['variable'], "not in loaded settings"
else:
    print "Full settings:", my_settings
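# Hypothetical contents for the two files consumed above, to illustrate the
# override behaviour: keys in localsettings.py replace the defaults from
# settings.py, and new keys are merged into the final settings dict. The
# specific values here are examples only.

# settings.py
LOG_LEVEL = 'INFO'
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# localsettings.py
LOG_LEVEL = 'DEBUG'            # overrides the default above
KAFKA_HOSTS = 'kafka:9092'     # new key, added to the loaded settings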
def main(): # initial main parser setup parser = argparse.ArgumentParser( description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for ' 'debugging.', add_help=False) parser.add_argument('-h', '--help', action=ArgparseHelper, help='show this help message and exit') subparsers = parser.add_subparsers(help='commands', dest='command') # args to use for all commands base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('-kh', '--kafka-host', action='store', required=False, help="The override Kafka host") base_parser.add_argument('-s', '--settings', action='store', required=False, help="The settings file to read from", default="localsettings.py") base_parser.add_argument( '-ll', '--log-level', action='store', required=False, help="The log level", default=None, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) # list command list_parser = subparsers.add_parser('list', help='List all Kafka topics', parents=[base_parser]) # dump command dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic', parents=[base_parser]) dump_parser.add_argument('-t', '--topic', action='store', required=True, help="The Kafka topic to read from") dump_parser.add_argument('-c', '--consumer', action='store', required=False, default='default', help="The Kafka consumer id to use") dump_parser.add_argument('-b', '--from-beginning', action='store_const', required=False, const=True, help="Read the topic from the beginning") dump_parser.add_argument('-nb', '--no-body', action='store_const', required=False, const=True, default=False, help="Do not include the raw html 'body' key in" " the json dump of the topic") dump_parser.add_argument('-p', '--pretty', action='store_const', required=False, const=True, default=False, help="Pretty print the json objects consumed") dump_parser.add_argument('-d', '--decode-base64', action='store_const', required=False, const=True, default=False, help="Decode the base64 encoded raw html body") args = vars(parser.parse_args()) wrapper = SettingsWrapper() settings = wrapper.load(args['settings']) kafka_host = args['kafka_host'] if args['kafka_host'] else settings[ 'KAFKA_HOSTS'] log_level = args['log_level'] if args['log_level'] else settings[ 'LOG_LEVEL'] logger = LogFactory.get_instance(level=log_level, name='kafkadump') logger.debug("Connecting to {0}...".format(kafka_host)) try: kafka = KafkaClient(kafka_host) logger.info("Connected to {0}".format(kafka_host)) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) logger.error(message) sys.exit(1) if args['command'] == 'list': logger.debug('Running list command') print "Topics:" for topic in kafka.topic_partitions.keys(): print "-", topic return 0 elif args['command'] == 'dump': logger.debug('Running dump command') topic = args["topic"] consumer_id = args["consumer"] @MethodTimer.timeout(5, None) def _hidden(): try: logger.debug("Ensuring topic {t} exists".format(t=topic)) kafka.ensure_topic_exists(topic) logger.debug("Getting Kafka consumer") consumer = SimpleConsumer(kafka, consumer_id, topic, buffer_size=1024 * 100, fetch_size_bytes=1024 * 100, max_buffer_size=None) return consumer except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) logger.error(message) sys.exit(1) consumer = _hidden() if consumer is None: logger.error("Could not fully connect to Kafka within the timeout") sys.exit(1) if args["from_beginning"]: logger.debug("Seeking to beginning") consumer.seek(0, 0) else: logger.debug("Reading from the end") consumer.seek(0, 2) num_records = 0 total_bytes = 0 item = None while True: try: for message in consumer.get_messages(): if message is None: logger.debug("no message") break logger.debug("Received message") val = message.message.value try: item = json.loads(val) if args['decode_base64'] and 'body' in item: item['body'] = base64.b64decode(item['body']) if args['no_body'] and 'body' in item: del item['body'] except ValueError: logger.info("Message is not a JSON object") item = val body_bytes = len(item) if args['pretty']: print json.dumps(item, indent=4) else: print item num_records = num_records + 1 total_bytes = total_bytes + body_bytes except KeyboardInterrupt: logger.debug("Keyboard interrupt received") break except: logger.error(traceback.print_exc()) break total_mbs = float(total_bytes) / (1024 * 1024) if item is not None: print "Last item:" print json.dumps(item, indent=4) if num_records > 0: logger.info( "Num Records: {n}, Total MBs: {m}, kb per message: {kb}". format(n=num_records, m=total_mbs, kb=(float(total_bytes) / num_records / 1024))) else: logger.info("No records consumed") num_records = 0 logger.info("Closing Kafka connection") kafka.close() return 0
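# The _hidden() helper above bounds the consumer setup with scutils'
# MethodTimer decorator. A minimal standalone sketch of that decorator; the
# function body and the 5 second / None values are examples only.
import time
from scutils.method_timer import MethodTimer

@MethodTimer.timeout(5, None)      # return None if the call exceeds 5 seconds
def slow_connect():
    time.sleep(10)                 # simulate a hung broker connection
    return "connected"

result = slow_connect()
if result is None:
    print("Could not connect within the timeout")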
class KafkaMonitor(object): consumer = None def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test self.my_uuid = str(uuid.uuid4()).split('-')[4] def _import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d + 1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins, defaults and settings.py ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}".format(cls=key)) the_class = self._import_class(key) instance = the_class() # 插件类的实例 instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_schema = None with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file: the_schema = json.load(the_file) mini = {} # 每个插件的实例 和 对应的schema(请求的规范) mini['instance'] = instance mini['schema'] = the_schema self.logger.debug( "Successfully loaded plugin {cls}".format(cls=key)) self.plugins_dict[ plugins[key]] = mini # {'插件名':{'插件实例':'','schema':''} ,} self.plugins_dict = OrderedDict( sorted(list(self.plugins_dict.items()), key=lambda t: t[0])) def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance( json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator) def _setup_stats(self): ''' Sets up the stats collection ''' self.stats_dict = {} redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], db=self.settings.get('REDIS_DB')) try: redis_conn.info() self.logger.debug("Connected to Redis in StatsCollector Setup") self.redis_conn = redis_conn except ConnectionError: self.logger.warn("Failed to connect to Redis in StatsCollector" " Setup, no stats will be collected") return if self.settings['STATS_TOTAL']: self._setup_stats_total(redis_conn) if self.settings['STATS_PLUGINS']: self._setup_stats_plugins(redis_conn) def _setup_stats_total(self, redis_conn): ''' Sets up the total stats collectors @param redis_conn: the redis connection ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:kafka-monitor:total' temp_key2 = 'stats:kafka-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, 
key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self, redis_conn): ''' Sets up the plugin stats collectors @param redis_conn: the redis connection ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _setup_kafka(self): ''' Sets up kafka connections ''' self.consumer = self._create_consumer() self.logger.debug("Successfully connected to Kafka") def extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in list(properties.items()): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def _main_loop(self): ''' Continuous loop that reads from a kafka topic and tries to validate incoming messages ''' self.logger.debug("Processing messages") old_time = 0 while True: self._process_messages() if self.settings['STATS_DUMP'] != 0: new_time = int( old_div(time.time(), self.settings['STATS_DUMP'])) # only log every X seconds if new_time != old_time: self._dump_stats() old_time = new_time self._report_self() time.sleep(self.settings['SLEEP_TIME']) # 生产者已经把json_item = {u'uuid': u'abc123', u'appid': u'stuff'}加入到dome.incoming topic 中 # 这里创建消费者去topic 中取消息,通过将 消息格式 与 插件 匹配 让插件处理消息(scraper插件实现让crawl消息存入redis中) def _process_messages(self): try: # self.consumer = <kafka.consumer.group.KafkaConsumer object at 0x2648f10> for message in self.consumer: # 
message: ConsumerRecord( # topic=u'demo.incoming', partition=0, offset=13, # timestamp=1515581029167, timestamp_type=0, key=None, # value='{"url": "istresearch.com", "crawlid": "abc123", "appid": "madisonTest"}', # checksum=-342948442, serialized_key_size=-1, serialized_value_size=71 # ) # ConsumerRecord( # topic=u'demo.incoming', partition=0, offset=14, # timestamp=1515581538398, timestamp_type=0, key=None, # value='{"uuid": "abc123", "appid": "stuff"}', # checksum=-439167079, serialized_key_size=-1, serialized_value_size=36) if message is None: self.logger.debug("no message") break try: self._increment_total_stat(message.value) loaded_dict = json.loads( message.value) # {"uuid": "abc123", "appid": "stuff"} found_plugin = False for key in self.plugins_dict: # {'插件名':{'插件实例':'','schema':''} ,} # to prevent reference modification the_dict = copy.deepcopy(loaded_dict) obj = self.plugins_dict[key] instance = obj['instance'] schema = obj['schema'] try: self.validator(schema).validate(the_dict) # 匹配插件 found_plugin = True self._increment_plugin_stat( instance.__class__.__name__, the_dict) # the_dict: {u'allowed_domains': None, u'allow_regex': None, # u'crawlid': u'abc123', # u'url': u'istresearch.com', u'expires': 0, # 'ts': 1515581029.185904, u'priority': 1, # u'deny_regex': None, u'cookie': None, # u'attrs': None, u'appid': u'madisonTest', # u'spiderid': u'link', u'useragent': None, u'deny_extensions': None, # u'maxdepth': 0} #{u'stats': u'all', u'uuid': u'abc123', 'ts': 1515581538.40857, u'appid': u'stuff'} ret = instance.handle(the_dict) # break if nothing is returned if ret is None: break except ValidationError: pass if not found_plugin: extras = {} extras['parsed'] = True extras['valid'] = False extras['data'] = the_dict self.logger.warn( "Did not find schema to validate " "request", extra=extras) self._increment_fail_stat(the_dict) except ValueError: extras = {} extras['parsed'] = False extras['valid'] = False extras['data'] = message.value self.logger.warning('Unparseable JSON Received', extra=extras) self._increment_fail_stat(message.value) except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek_to_end() self.logger.error("Kafka offset out of range error") def _increment_total_stat(self, string): ''' Increments the total stat counters @param string: the loaded message object for the counter ''' string = string + str(time.time()) if 'total' in self.stats_dict: self.logger.debug("Incremented total stats") for key in self.stats_dict['total']: if key == 'lifetime': self.stats_dict['total'][key].increment(string) else: self.stats_dict['total'][key].increment() def _increment_fail_stat(self, item): ''' Increments the total stat counters @param item: the loaded message object for HLL counter ''' if isinstance(item, dict): item['ts'] = time.time() elif isinstance(item, str): item = item + str(time.time()) if 'fail' in self.stats_dict: self.logger.debug("Incremented fail stats") for key in self.stats_dict['fail']: if key == 'lifetime': self.stats_dict['fail'][key].increment(item) else: self.stats_dict['fail'][key].increment() def _increment_plugin_stat(self, name, item): ''' Increments the total stat counters @param name: The formal name of the plugin @param dict: the loaded message object for HLL counter ''' item['ts'] = time.time() if 'plugins' in self.stats_dict: self.logger.debug("Incremented plugin '{p}' plugin stats"\ .format(p=name)) for key in self.stats_dict['plugins'][name]: if key == 'lifetime': self.stats_dict['plugins'][name][key].increment(item) else: 
self.stats_dict['plugins'][name][key].increment() def _dump_stats(self): ''' Dumps the stats out ''' extras = {} if 'total' in self.stats_dict: self.logger.debug("Compiling total/fail dump stats") for key in self.stats_dict['total']: final = 'total_{t}'.format(t=key) extras[final] = self.stats_dict['total'][key].value() for key in self.stats_dict['fail']: final = 'fail_{t}'.format(t=key) extras[final] = self.stats_dict['fail'][key].value() if 'plugins' in self.stats_dict: self.logger.debug("Compiling plugin dump stats") for name in self.stats_dict['plugins']: for key in self.stats_dict['plugins'][name]: final = 'plugin_{n}_{t}'.format(n=name, t=key) extras[final] = self.stats_dict['plugins'][name][ key].value() if not self.logger.json: self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Kafka Monitor Stats Dump', extra=extras) def run(self): ''' Set up and run ''' self._setup_kafka() self._load_plugins() self._setup_stats() self._main_loop() def _report_self(self): ''' Reports the kafka monitor uuid to redis ''' key = "stats:kafka-monitor:self:{m}:{u}".format(m=socket.gethostname(), u=self.my_uuid) self.redis_conn.set(key, time.time()) self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT']) def feed(self, json_item): ''' Feeds a json item into the Kafka topic @param json_item: The loaded json object ''' @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False) def _feed(json_item): producer = self._create_producer() topic = self.settings['KAFKA_INCOMING_TOPIC'] if not self.logger.json: self.logger.info('Feeding JSON into {0}\n{1}'.format( topic, json.dumps(json_item, indent=4))) else: self.logger.info('Feeding JSON into {0}\n'.format(topic), extra={'value': json_item}) if producer is not None: producer.send(topic, json_item) producer.flush() producer.close(timeout=10) return True else: return False result = _feed(json_item) if result: self.logger.info("Successfully fed item to Kafka") else: self.logger.error("Failed to feed item into Kafka") @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_consumer(self): """Tries to establing the Kafka consumer connection""" try: brokers = self.settings['KAFKA_HOSTS'] self.logger.debug("Creating new kafka consumer using brokers: " + str(brokers) + ' and topic ' + self.settings['KAFKA_INCOMING_TOPIC']) return KafkaConsumer( self.settings['KAFKA_INCOMING_TOPIC'], group_id=self.settings['KAFKA_GROUP'], bootstrap_servers=brokers, consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'], auto_offset_reset=self. settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'], auto_commit_interval_ms=self. settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'], enable_auto_commit=self. settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'], max_partition_fetch_bytes=self. 
settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error( "Couldn't initialize kafka consumer for topic", { 'ex': traceback.format_exc(), 'topic': self.settings['KAFKA_INCOMING_TOPIC'] }) raise @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_producer(self): """Tries to establish a Kafka consumer connection""" try: brokers = self.settings['KAFKA_HOSTS'] self.logger.debug("Creating new kafka producer using brokers: " + str(brokers)) return KafkaProducer( bootstrap_servers=brokers, value_serializer=lambda m: json.dumps(m), retries=3, linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'], buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize kafka producer.", {'ex': traceback.format_exc()}) raise def close(self): ''' Call to properly tear down the Kafka Monitor ''' if self.consumer is not None: self.consumer.close()
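# _create_consumer() and _create_producer() above rely on the `retrying`
# package for exponential backoff. A minimal sketch of that decorator in
# isolation; flaky() is a stand-in, not part of the original code.
import random
from retrying import retry

@retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
def flaky():
    # retried with waits of roughly 0.5s, 1s, 2s, ... capped at 10s, until it succeeds
    if random.random() < 0.7:
        raise ConnectionError("broker not ready yet")
    return "ok"

print(flaky())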
def main(): # initial main parser setup parser = argparse.ArgumentParser( description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for ' 'debugging.', add_help=False) parser.add_argument('-h', '--help', action=ArgparseHelper, help='show this help message and exit') subparsers = parser.add_subparsers(help='commands', dest='command') # args to use for all commands base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('-kh', '--kafka-host', action='store', required=False, help="The override Kafka host") base_parser.add_argument('-s', '--settings', action='store', required=False, help="The settings file to read from", default="localsettings.py") base_parser.add_argument('-ll', '--log-level', action='store', required=False, help="The log level", default=None, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) # list command list_parser = subparsers.add_parser('list', help='List all Kafka topics', parents=[base_parser]) # dump command dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic', parents=[base_parser]) dump_parser.add_argument('-t', '--topic', action='store', required=True, help="The Kafka topic to read from") dump_parser.add_argument('-c', '--consumer', action='store', required=False, default=None, help="The Kafka consumer id to use") dump_parser.add_argument('-b', '--from-beginning', action='store_const', required=False, const=True, help="Read the topic from the beginning") dump_parser.add_argument('-nb', '--no-body', action='store_const', required=False, const=True, default=False, help="Do not include the raw html 'body' key in" " the json dump of the topic") dump_parser.add_argument('-p', '--pretty', action='store_const', required=False, const=True, default=False, help="Pretty print the json objects consumed") dump_parser.add_argument('-d', '--decode-base64', action='store_const', required=False, const=True, default=False, help="Decode the base64 encoded raw html body") dump_parser.add_argument('-m', '--mongodb', action="store", help="Set mongodb to save webpages") args = vars(parser.parse_args()) wrapper = SettingsWrapper() settings = wrapper.load(args['settings']) kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS'] log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL'] logger = LogFactory.get_instance(level=log_level, name='kafkadump') if args['command'] == 'list': try: logger.debug("Connecting to {0}...".format(kafka_host)) kafka = SimpleClient(kafka_host) logger.info("Connected to {0}".format(kafka_host)) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) logger.error(message) sys.exit(1) logger.debug('Running list command') print("Topics:") for topic in list(kafka.topic_partitions.keys()): print("-", topic) kafka.close() return 0 elif args['command'] == 'dump': logger.debug('Running dump command') topic = args["topic"] consumer_id = args["consumer"] try: logger.debug("Getting Kafka consumer") offset = 'earliest' if args["from_beginning"] else 'latest' consumer = KafkaConsumer( topic, group_id=consumer_id, bootstrap_servers=kafka_host, consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'], auto_offset_reset=offset, auto_commit_interval_ms=settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'], enable_auto_commit=settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'], max_partition_fetch_bytes=settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES']) except NoBrokersAvailable as ex: logger.error('Unable to connect to Kafka') sys.exit(1) num_records = 0 total_bytes = 0 item = None while True: try: for message in consumer: if message is None: logger.debug("no message") break logger.debug("Received message") val = message.value try: item = json.loads(val) if args['decode_base64'] and 'body' in item: item['body'] = str(base64.b64decode(item['body'])) if args['no_body'] and 'body' in item: del item['body'] except BaseException, msg: logger.info("Message is not a JSON object") logger.info("base64 error: ", msg) item = val body_bytes = len(item) if args['pretty']: print(json.dumps(item, indent=4)) else: print(item) num_records = num_records + 1 total_bytes = total_bytes + body_bytes except KeyboardInterrupt: logger.debug("Keyboard interrupt received") break except: logger.error(traceback.print_exc()) break total_mbs = old_div(float(total_bytes), (1024*1024)) if item is not None: print("Last item:") print(json.dumps(item, indent=4)) if num_records > 0: logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}" .format(n=num_records, m=total_mbs, kb=(float(total_bytes) / num_records / 1024))) else: logger.info("No records consumed") num_records = 0 logger.info("Closing Kafka connection") try: consumer.close() except: # Exception is thrown when group_id is None. # See https://github.com/dpkp/kafka-python/issues/619 pass return 0
class KafkaMonitor: def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test def _import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d + 1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins, defaults and settings.py ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}".format(cls=key)) the_class = self._import_class(key) instance = the_class() instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_schema = None print("self.settings['PLUGIN_DIR'] + instance.schema====", self.settings['PLUGIN_DIR'] + instance.schema) with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file: the_schema = json.load(the_file) mini = {} mini['instance'] = instance mini['schema'] = the_schema self.plugins_dict[plugins[key]] = mini self.plugins_dict = OrderedDict( sorted(self.plugins_dict.items(), key=lambda t: t[0])) def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance( json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator) def _setup_stats(self): ''' Sets up the stats collection ''' self.stats_dict = {} redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: redis_conn.info() self.logger.debug("Connected to Redis in StatsCollector Setup") except ConnectionError: self.logger.warn("Failed to connect to Redis in StatsCollector" " Setup, no stats will be collected") return if self.settings['STATS_TOTAL']: self._setup_stats_total(redis_conn) if self.settings['STATS_PLUGINS']: self._setup_stats_plugins(redis_conn) def _setup_stats_total(self, redis_conn): ''' Sets up the total stats collectors @param redis_conn: the redis connection ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:kafka-monitor:total' temp_key2 = 'stats:kafka-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, 
key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self, redis_conn): ''' Sets up the plugin stats collectors @param redis_conn: the redis connection ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter( redis_conn=redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _setup_kafka(self): ''' Sets up kafka connections ''' @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False) def _hidden_setup(): try: self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) self.kafka_conn.ensure_topic_exists( self.settings['KAFKA_INCOMING_TOPIC']) self.consumer = SimpleConsumer( self.kafka_conn, self.settings['KAFKA_GROUP'], self.settings['KAFKA_INCOMING_TOPIC'], auto_commit=True, iter_timeout=1.0) except KafkaUnavailableError as ex: message = "An exception '{0}' occured. 
Arguments:\n{1!r}" \ .format(type(ex).__name__, ex.args) self.logger.error(message) sys.exit(1) return True ret_val = _hidden_setup() if ret_val: self.logger.debug("Successfully connected to Kafka") else: self.logger.error("Failed to set up Kafka Connection within" " timeout") # this is essential to running the kafka monitor sys.exit(1) def extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in properties.iteritems(): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def _main_loop(self): ''' Continuous loop that reads from a kafka topic and tries to validate incoming messages ''' self.logger.debug("Processing messages") old_time = 0 while True: self._process_messages() if self.settings['STATS_DUMP'] != 0: new_time = int(time.time() / self.settings['STATS_DUMP']) # only log every X seconds if new_time != old_time: self._dump_stats() old_time = new_time time.sleep(.01) def _process_messages(self): try: for message in self.consumer.get_messages(): if message is None: self.logger.debug("no message") break try: self._increment_total_stat(message.message.value) the_dict = json.loads(message.message.value) print('the_dict', the_dict) found_plugin = False print('self.plugins_dict', self.plugins_dict) for key in self.plugins_dict: obj = self.plugins_dict[key] instance = obj['instance'] print('instance==', instance) schema = obj['schema'] print( 'schema********************************************', schema) try: print('before v = self.validator(schema)') v = self.validator(schema) print('after v = self.validator(schema)') print('the_dict-------', the_dict) v.validate(the_dict) found_plugin = True print('found_plugin====', found_plugin) self._increment_plugin_stat( instance.__class__.__name__, the_dict) print('instance.handle(the_dict)', the_dict) ret = instance.handle(the_dict) # break if nothing is returned if ret is None: break except ValidationError: print(' except ValidationError:======') pass if not found_plugin: extras = {} extras['parsed'] = True extras['valid'] = False extras['data'] = the_dict self.logger.warn( "Did not find schema to validate " "request", extra=extras) self._increment_fail_stat(the_dict) except ValueError: extras = {} extras['parsed'] = False extras['valid'] = False extras['data'] = message.message.value self.logger.warning('Unparseable JSON Received', extra=extras) self._increment_fail_stat(message.message.value) except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek(0, 2) self.logger.error("Kafka offset out of range error") def _increment_total_stat(self, string): ''' Increments the total stat counters @param string: the loaded message object for the counter ''' string = string + str(time.time()) if 'total' in self.stats_dict: self.logger.debug("Incremented total stats") for key in self.stats_dict['total']: if key == 'lifetime': self.stats_dict['total'][key].increment(string) else: self.stats_dict['total'][key].increment() def _increment_fail_stat(self, item): ''' Increments the total stat counters @param item: the loaded message object for HLL counter ''' if isinstance(item, dict): item['ts'] = time.time() elif isinstance(item, str): 
item = item + str(time.time()) if 'fail' in self.stats_dict: self.logger.debug("Incremented fail stats") for key in self.stats_dict['fail']: if key == 'lifetime': self.stats_dict['fail'][key].increment(item) else: self.stats_dict['fail'][key].increment() def _increment_plugin_stat(self, name, item): ''' Increments the total stat counters @param name: The formal name of the plugin @param dict: the loaded message object for HLL counter ''' item['ts'] = time.time() if 'plugins' in self.stats_dict: self.logger.debug("Incremented plugin '{p}' plugin stats"\ .format(p=name)) for key in self.stats_dict['plugins'][name]: if key == 'lifetime': self.stats_dict['plugins'][name][key].increment(item) else: self.stats_dict['plugins'][name][key].increment() def _dump_stats(self): ''' Dumps the stats out ''' extras = {} if 'total' in self.stats_dict: self.logger.debug("Compiling total/fail dump stats") for key in self.stats_dict['total']: final = 'total_{t}'.format(t=key) extras[final] = self.stats_dict['total'][key].value() for key in self.stats_dict['fail']: final = 'fail_{t}'.format(t=key) extras[final] = self.stats_dict['fail'][key].value() if 'plugins' in self.stats_dict: self.logger.debug("Compiling plugin dump stats") for name in self.stats_dict['plugins']: for key in self.stats_dict['plugins'][name]: final = 'plugin_{n}_{t}'.format(n=name, t=key) extras[final] = self.stats_dict['plugins'][name][ key].value() if not self.logger.json: self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Kafka Monitor Stats Dump', extra=extras) def run(self): ''' Set up and run ''' self._setup_kafka() self._load_plugins() self._setup_stats() self._main_loop() def feed(self, json_item): ''' Feeds a json item into the Kafka topic @param json_item: The loaded json object ''' @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False) def _feed(json_item): try: self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS']) topic = self.settings['KAFKA_INCOMING_TOPIC'] producer = SimpleProducer(self.kafka_conn) except KafkaUnavailableError: self.logger.error("Unable to connect to Kafka") return False if not self.logger.json: self.logger.info('Feeding JSON into {0}\n{1}'.format( topic, json.dumps(json_item, indent=4))) else: self.logger.info('Feeding JSON into {0}\n'.format(topic), extra={'value': json_item}) self.kafka_conn.ensure_topic_exists(topic) producer.send_messages(topic, json.dumps(json_item)) return True result = _feed(json_item) if result: self.logger.info("Successfully fed item to Kafka") else: self.logger.error("Failed to feed item into Kafka")
class RedisMonitor(object): def __init__(self, settings_name, unit_test=False): ''' @param settings_name: the local settings file name @param unit_test: whether running unit tests or not ''' self.settings_name = settings_name self.redis_conn = None self.wrapper = SettingsWrapper() self.logger = None self.unit_test = unit_test self.my_uuid = str(uuid.uuid4()).split('-')[4] def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.redis_conn = redis.StrictRedis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], db=self.settings['REDIS_DB'], password=self.settings['REDIS_PASSWORD'], decode_responses=True, socket_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT'), socket_connect_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT')) # redis_lock needs a redis connection without setting decode_responses # to True self.lock_redis_conn = redis.StrictRedis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], db=self.settings['REDIS_DB'], password=self.settings['REDIS_PASSWORD'], socket_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT'), socket_connect_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT')) try: self.redis_conn.info() self.logger.debug("Successfully connected to Redis") except ConnectionError: self.logger.error("Failed to connect to Redis") # essential to functionality sys.exit(1) self._load_plugins() self._setup_stats() def import_class(self, cl): ''' Imports a class from a string @param name: the module and class name in dot notation ''' d = cl.rfind(".") classname = cl[d+1:len(cl)] m = __import__(cl[0:d], globals(), locals(), [classname]) return getattr(m, classname) def _load_plugins(self): ''' Sets up all plugins and defaults ''' plugins = self.settings['PLUGINS'] self.plugins_dict = {} for key in plugins: # skip loading the plugin if its value is None if plugins[key] is None: continue # valid plugin, import and setup self.logger.debug("Trying to load plugin {cls}" .format(cls=key)) the_class = self.import_class(key) instance = the_class() instance.redis_conn = self.redis_conn instance._set_logger(self.logger) if not self.unit_test: instance.setup(self.settings) the_regex = instance.regex mini = {} mini['instance'] = instance if the_regex is None: raise ImportError() # continue mini['regex'] = the_regex self.plugins_dict[plugins[key]] = mini self.plugins_dict = OrderedDict(sorted(list(self.plugins_dict.items()), key=lambda t: t[0])) def run(self): ''' The external main run loop ''' self._main_loop() def _main_loop(self): ''' The internal while true main loop for the redis monitor ''' self.logger.debug("Running main loop") old_time = 0 while True: for plugin_key in self.plugins_dict: obj = self.plugins_dict[plugin_key] self._process_plugin(obj) if self.settings['STATS_DUMP'] 
!= 0: new_time = int(old_div(time.time(), self.settings['STATS_DUMP'])) # only log every X seconds if new_time != old_time: self._dump_stats() if self.settings['STATS_DUMP_CRAWL']: self._dump_crawl_stats() if self.settings['STATS_DUMP_QUEUE']: self._dump_queue_stats() old_time = new_time self._report_self() time.sleep(self.settings['SLEEP_TIME']) def _process_plugin(self, plugin): ''' Logic to handle each plugin that is active @param plugin: a plugin dict object ''' instance = plugin['instance'] regex = plugin['regex'] for key in self.redis_conn.scan_iter(match=regex): # acquire lock lock = self._create_lock_object(key) try: if lock.acquire(blocking=False): val = self.redis_conn.get(key) self._process_key_val(instance, key, val) except Exception: self.logger.error(traceback.format_exc()) self._increment_fail_stat('{k}:{v}'.format(k=key, v=val)) self._process_failures(key) # remove lock regardless of if exception or was handled ok if lock._held: self.logger.debug("releasing lock") lock.release() def _create_lock_object(self, key): ''' Returns a lock object, split for testing ''' return redis_lock.Lock(self.lock_redis_conn, key, expire=self.settings['REDIS_LOCK_EXPIRATION'], auto_renewal=True) def _get_fail_key(self, key): ''' Returns the fail key string of a normal key ''' return 'lock:{k}:failures'.format(k=key) def _process_failures(self, key): ''' Handles the retrying of the failed key ''' if self.settings['RETRY_FAILURES']: self.logger.debug("going to retry failure") # get the current failure count failkey = self._get_fail_key(key) current = self.redis_conn.get(failkey) if current is None: current = 0 else: current = int(current) if current < self.settings['RETRY_FAILURES_MAX']: self.logger.debug("Incr fail key") current += 1 self.redis_conn.set(failkey, current) else: self.logger.error("Could not process action within" " failure limit") self.redis_conn.delete(failkey) self.redis_conn.delete(key) def _process_key_val(self, instance, key, val): ''' Logic to let the plugin instance process the redis key/val Split out for unit testing @param instance: the plugin instance @param key: the redis key @param val: the key value from redis ''' if instance.check_precondition(key, val): combined = '{k}:{v}'.format(k=key, v=val) self._increment_total_stat(combined) self._increment_plugin_stat( instance.__class__.__name__, combined) instance.handle(key, val) self.redis_conn.delete(key) failkey = self._get_fail_key(key) if self.redis_conn.exists(failkey): self.redis_conn.delete(failkey) def _setup_stats(self): ''' Sets up the stats ''' # stats setup self.stats_dict = {} if self.settings['STATS_TOTAL']: self._setup_stats_total() if self.settings['STATS_PLUGINS']: self._setup_stats_plugins() def _setup_stats_total(self): ''' Sets up the total stats collectors ''' self.stats_dict['total'] = {} self.stats_dict['fail'] = {} temp_key1 = 'stats:redis-monitor:total' temp_key2 = 'stats:redis-monitor:fail' for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['total'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=self.redis_conn, key='{k}:{t}'.format(k=temp_key1, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.stats_dict['fail'][time] = StatsCollector \ .get_rolling_time_window( redis_conn=self.redis_conn, key='{k}:{t}'.format(k=temp_key2, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up total/fail Stats Collector '{i}'"\ .format(i=item)) except AttributeError as e: 
self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total1 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn, key='{k}:lifetime'.format(k=temp_key1), cycle_time=self.settings['STATS_CYCLE'], roll=False) total2 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn, key='{k}:lifetime'.format(k=temp_key2), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up total/fail Stats Collector 'lifetime'") self.stats_dict['total']['lifetime'] = total1 self.stats_dict['fail']['lifetime'] = total2 def _setup_stats_plugins(self): ''' Sets up the plugin stats collectors ''' self.stats_dict['plugins'] = {} for key in self.plugins_dict: plugin_name = self.plugins_dict[key]['instance'].__class__.__name__ temp_key = 'stats:redis-monitor:{p}'.format(p=plugin_name) self.stats_dict['plugins'][plugin_name] = {} for item in self.settings['STATS_TIMES']: try: time = getattr(StatsCollector, item) self.stats_dict['plugins'][plugin_name][time] = StatsCollector \ .get_rolling_time_window( redis_conn=self.redis_conn, key='{k}:{t}'.format(k=temp_key, t=time), window=time, cycle_time=self.settings['STATS_CYCLE']) self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\ .format(p=plugin_name, i=item)) except AttributeError as e: self.logger.warning("Unable to find Stats Time '{s}'"\ .format(s=item)) total = StatsCollector.get_hll_counter(redis_conn=self.redis_conn, key='{k}:lifetime'.format(k=temp_key), cycle_time=self.settings['STATS_CYCLE'], roll=False) self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\ .format(p=plugin_name)) self.stats_dict['plugins'][plugin_name]['lifetime'] = total def _increment_total_stat(self, item): ''' Increments the total stat counters @param item: the unique print for HLL counter ''' item = item + str(time.time()) if 'total' in self.stats_dict: self.logger.debug("Incremented total stats") for key in self.stats_dict['total']: if key == 'lifetime': self.stats_dict['total'][key].increment(item) else: self.stats_dict['total'][key].increment() def _increment_fail_stat(self, item): ''' Increments the total stat counters @param item: the unique print for HLL counter ''' item = item + str(time.time()) if 'fail' in self.stats_dict: self.logger.debug("Incremented fail stats") for key in self.stats_dict['fail']: if key == 'lifetime': self.stats_dict['fail'][key].increment(item) else: self.stats_dict['fail'][key].increment() def _increment_plugin_stat(self, name, item): ''' Increments the total stat counters @param name: The formal name of the plugin @param item: the unique print for HLL counter ''' item = item + str(time.time()) if 'plugins' in self.stats_dict: self.logger.debug("Incremented plugin '{p}' plugin stats"\ .format(p=name)) for key in self.stats_dict['plugins'][name]: if key == 'lifetime': self.stats_dict['plugins'][name][key].increment(item) else: self.stats_dict['plugins'][name][key].increment() def _dump_stats(self): ''' Dumps the stats out ''' extras = {} if 'total' in self.stats_dict: self.logger.debug("Compiling total/fail dump stats") for key in self.stats_dict['total']: final = 'total_{t}'.format(t=key) extras[final] = self.stats_dict['total'][key].value() for key in self.stats_dict['fail']: final = 'fail_{t}'.format(t=key) extras[final] = self.stats_dict['fail'][key].value() if 'plugins' in self.stats_dict: self.logger.debug("Compiling plugin dump stats") for name in self.stats_dict['plugins']: for key in self.stats_dict['plugins'][name]: final = 'plugin_{n}_{t}'.format(n=name, t=key) extras[final] = 
self.stats_dict['plugins'][name][key].value() if not self.logger.json: self.logger.info('Redis Monitor Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Redis Monitor Stats Dump', extra=extras) def _dump_crawl_stats(self): ''' Dumps flattened crawling stats so the spiders do not have to ''' extras = {} spiders = {} spider_set = set() total_spider_count = 0 keys = self.redis_conn.keys('stats:crawler:*:*:*') for key in keys: # we only care about the spider elements = key.split(":") spider = elements[3] if spider not in spiders: spiders[spider] = 0 if len(elements) == 6: # got a time based stat response = elements[4] end = elements[5] final = '{s}_{r}_{e}'.format(s=spider, r=response, e=end) if end == 'lifetime': value = self.redis_conn.execute_command("PFCOUNT", key) else: value = self.redis_conn.zcard(key) extras[final] = value elif len(elements) == 5: # got a spider identifier spiders[spider] += 1 total_spider_count += 1 spider_set.add(spider) else: self.logger.warn("Unknown crawler stat key", {"key":key}) # simple counts extras['unique_spider_count'] = len(spider_set) extras['total_spider_count'] = total_spider_count for spider in spiders: extras['{k}_spider_count'.format(k=spider)] = spiders[spider] if not self.logger.json: self.logger.info('Crawler Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Crawler Stats Dump', extra=extras) def _dump_queue_stats(self): ''' Dumps basic info about the queue lengths for the spider types ''' extras = {} keys = self.redis_conn.keys('*:*:queue') total_backlog = 0 for key in keys: elements = key.split(":") spider = elements[0] domain = elements[1] spider = 'queue_' + spider if spider not in extras: extras[spider] = {} extras[spider]['spider_backlog'] = 0 extras[spider]['num_domains'] = 0 count = self.redis_conn.zcard(key) total_backlog += count extras[spider]['spider_backlog'] += count extras[spider]['num_domains'] += 1 extras['total_backlog'] = total_backlog if not self.logger.json: self.logger.info('Queue Stats Dump:\n{0}'.format( json.dumps(extras, indent=4, sort_keys=True))) else: self.logger.info('Queue Stats Dump', extra=extras) def _report_self(self): ''' Reports the redis monitor uuid to redis ''' key = "stats:redis-monitor:self:{m}:{u}".format( m=socket.gethostname(), u=self.my_uuid) self.redis_conn.set(key, time.time()) self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT']) def close(self): ''' Closes the Redis Monitor and plugins ''' for plugin_key in self.plugins_dict: obj = self.plugins_dict[plugin_key] instance = obj['instance'] instance.close()
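# --- Usage sketch (illustrative; not part of the original source) ---------
# The block below shows the intended lifecycle of the RedisMonitor defined
# above: build it from a settings module, call setup() to wire up logging,
# Redis, plugins and stats, then block inside run(). The settings file name
# 'localsettings.py' and the setup() keyword values are assumptions made for
# this example only.
def run_redis_monitor():
    monitor = RedisMonitor('localsettings.py')
    monitor.setup(level='INFO', log_file=False, json=False)
    try:
        monitor.run()               # blocking plugin/stats loop
    except KeyboardInterrupt:
        monitor.close()             # gives each loaded plugin a chance to clean up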
class RestService(object): # static strings SUCCESS = 'SUCCESS' FAILURE = 'FAILURE' UNKNOWN_ERROR = "An error occurred while processing your request." MUST_JSON = "The payload must be valid JSON." DOES_NOT_EXIST = "The desired endpoint does not exist" BAD_SCHEMA = "JSON did not validate against schema." consumer = None producer = None closed = False start_time = 0 _consumer_thread = None _kafka_thread = None _heartbeat_thread = None _redis_thread = None def __init__(self, settings_name): """ @param settings_name: the local settings file name """ self.settings_name = settings_name self.wrapper = SettingsWrapper() self.logger = None self.app = Flask(__name__) self.kafka_connected = False self.redis_connected = False self.my_uuid = str(uuid.uuid4()).split('-')[4] self.uuids = {} self.uuids_lock = threading.Lock() self.validator = self._extend_with_default(Draft4Validator) self.schemas = {} def setup(self, level=None, log_file=None, json=None): """ Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json """ self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance( json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self._decorate_routes() self._spawn_redis_connection_thread() self._spawn_kafka_connection_thread() # spawn heartbeat processing loop self._heartbeat_thread = Thread(target=self._heartbeat_loop) self._heartbeat_thread.setDaemon(True) self._heartbeat_thread.start() self.start_time = self.get_time() # disable flask logger if self.settings['FLASK_LOGGING_ENABLED'] == False: log = logging.getLogger('werkzeug') log.disabled = True self._load_schemas() def get_time(self): """Returns the current time""" return time.time() def _load_schemas(self): """Loads any schemas for JSON validation""" for filename in os.listdir(self.settings['SCHEMA_DIR']): if filename[-4:] == 'json': name = filename[:-5] with open(self.settings['SCHEMA_DIR'] + filename) as the_file: self.schemas[name] = json.load(the_file) self.logger.debug("Successfully loaded " + filename + " schema") def _extend_with_default(self, validator_class): ''' Method to add default fields to our schema validation ( From the docs ) ''' validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for error in validate_properties( validator, properties, instance, schema, ): yield error for property, subschema in list(properties.items()): if "default" in subschema: instance.setdefault(property, subschema["default"]) return validators.extend( validator_class, {"properties": set_defaults}, ) def _spawn_redis_connection_thread(self): """Spawns a redis connection thread""" self.logger.debug("Spawn redis connection thread") self.redis_connected = False self._redis_thread = Thread(target=self._setup_redis) self._redis_thread.setDaemon(True) self._redis_thread.start() def _spawn_kafka_connection_thread(self): """Spawns a kafka connection thread""" self.logger.debug("Spawn kafka 
connection thread") self.kafka_connected = False self._kafka_thread = Thread(target=self._setup_kafka) self._kafka_thread.setDaemon(True) self._kafka_thread.start() def _spawn_kafka_consumer_thread(self): """Spawns a kafka continuous consumer thread""" self.logger.debug("Spawn kafka consumer thread" "") self._consumer_thread = Thread(target=self._consumer_loop) self._consumer_thread.setDaemon(True) self._consumer_thread.start() def _consumer_loop(self): """The main consumer loop""" self.logger.debug("running main consumer thread") while not self.closed: if self.kafka_connected: self._process_messages() time.sleep(self.settings['KAFKA_CONSUMER_SLEEP_TIME']) def _process_messages(self): """Processes messages received from kafka""" try: for message in self.consumer: try: if message is None: self.logger.debug("no message") break loaded_dict = json.loads(message.value) self.logger.debug("got valid kafka message") with self.uuids_lock: if 'uuid' in loaded_dict: if loaded_dict['uuid'] in self.uuids and \ self.uuids[loaded_dict['uuid']] != 'poll': self.logger.debug( "Found Kafka message from request") self.uuids[loaded_dict['uuid']] = loaded_dict else: self.logger.debug("Got poll result") self._send_result_to_redis(loaded_dict) else: self.logger.debug( "Got message not intended for this process") except ValueError: extras = {} if message is not None: extras["data"] = message.value self.logger.warning('Unparseable JSON Received from kafka', extra=extras) self._check_kafka_disconnect() except OffsetOutOfRangeError: # consumer has no idea where they are self.consumer.seek_to_end() self.logger.error("Kafka offset out of range error") def _send_result_to_redis(self, result): """Sends the result of a poll to redis to be used potentially by another process @param result: the result retrieved from kafka""" if self.redis_connected: self.logger.debug("Sending result to redis") try: key = "rest:poll:{u}".format(u=result['uuid']) self.redis_conn.set(key, json.dumps(result)) except ConnectionError: self.logger.error("Lost connection to Redis") self._spawn_redis_connection_thread() else: self.logger.warning( "Unable to send result to redis, not connected") def _check_kafka_disconnect(self): """Checks the kafka connection is still valid""" for node_id in self.consumer._client._conns: conn = self.consumer._client._conns[node_id] if conn.state == ConnectionStates.DISCONNECTED or \ conn.state == ConnectionStates.DISCONNECTING: self._spawn_kafka_connection_thread() break def _heartbeat_loop(self): """A main run loop thread to do work""" self.logger.debug("running main heartbeat thread") while not self.closed: time.sleep(self.settings['SLEEP_TIME']) self._report_self() def _report_self(self): """ Reports the crawler uuid to redis """ if self.redis_connected: self.logger.debug("Reporting self to redis") try: key = "stats:rest:self:{m}:{u}".format(m=socket.gethostname(), u=self.my_uuid) self.redis_conn.set(key, self.get_time()) self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT']) except ConnectionError: self.logger.error("Lost connection to Redis") self._spawn_redis_connection_thread() else: self.logger.warn("Cannot report self to redis, not connected") @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _setup_redis(self): """Returns a Redis Client""" if not self.closed: try: self.logger.debug("Creating redis connection to host " + str(self.settings['REDIS_HOST'])) self.redis_conn = redis.StrictRedis( host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], 
db=self.settings['REDIS_DB']) self.redis_conn.info() self.redis_connected = True self.logger.info("Successfully connected to redis") except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize redis client.", {'ex': traceback.format_exc()}) raise def _setup_kafka(self): """ Sets up kafka connections """ # close older connections if self.consumer is not None: self.logger.debug("Closing existing kafka consumer") self.consumer.close() self.consumer = None if self.producer is not None: self.logger.debug("Closing existing kafka producer") self.producer.flush() self.producer.close(timeout=10) self.producer = None # create new connections self._consumer_thread = None self.logger.debug("Creating kafka connections") self.consumer = self._create_consumer() if not self.closed: self.logger.debug("Kafka Consumer created") self.producer = self._create_producer() if not self.closed: self.logger.debug("Kafka Producer created") if not self.closed: self.kafka_connected = True self.logger.info("Connected successfully to Kafka") self._spawn_kafka_consumer_thread() @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_consumer(self): """Tries to establish the Kafka consumer connection""" if not self.closed: try: self.logger.debug( "Creating new kafka consumer using brokers: " + str(self.settings['KAFKA_HOSTS']) + ' and topic ' + self.settings['KAFKA_TOPIC_PREFIX'] + ".outbound_firehose") return KafkaConsumer( self.settings['KAFKA_TOPIC_PREFIX'] + ".outbound_firehose", group_id=None, bootstrap_servers=self.settings['KAFKA_HOSTS'], consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'], auto_offset_reset=self.settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'], auto_commit_interval_ms=self.settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'], enable_auto_commit=self.settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'], max_partition_fetch_bytes=self.settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except Exception as e: self.logger.error( "Couldn't initialize kafka consumer for topic", {'ex': traceback.format_exc()}) self.logger.error(str(e)) raise @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_producer(self): """Tries to establish the Kafka producer connection""" if not self.closed: try: self.logger.debug( "Creating new kafka producer using brokers: " + str(self.settings['KAFKA_HOSTS'])) return KafkaProducer( bootstrap_servers=self.settings['KAFKA_HOSTS'], value_serializer=lambda v: json.dumps(v).encode('utf-8'), retries=3, linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'], buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES']) except KeyError as e: self.logger.error('Missing setting named ' + str(e), {'ex': traceback.format_exc()}) except: self.logger.error("Couldn't initialize kafka producer.", {'ex': traceback.format_exc()}) raise def run(self): """Main flask run loop""" self.logger.info("Running main flask method on port " + str(self.settings['FLASK_PORT'])) self.app.run(host='0.0.0.0', port=self.settings['FLASK_PORT']) def _create_ret_object(self, status=SUCCESS, data=None, error=False, error_message=None, error_cause=None): """ Create generic response objects. 
:param str status: The SUCCESS or FAILURE of the request :param obj data: The data to return :param bool error: Set to True to add Error response :param str error_message: The generic error message :param str error_cause: The cause of the error :returns: A dictionary of values """ ret = {} if status == self.FAILURE: ret['status'] = self.FAILURE else: ret['status'] = self.SUCCESS ret['data'] = data if error: ret['error'] = {} if error_message is not None: ret['error']['message'] = error_message if error_cause is not None: ret['error']['cause'] = error_cause else: ret['error'] = None return ret def _close_thread(self, thread, thread_name): """Closes daemon threads @param thread: the thread to close @param thread_name: a human readable name of the thread """ if thread is not None and thread.isAlive(): self.logger.debug( "Waiting for {} thread to close".format(thread_name)) thread.join(timeout=self.settings['DAEMON_THREAD_JOIN_TIMEOUT']) if thread.isAlive(): self.logger.warn("{} daemon thread unable to be shut down" " within timeout".format(thread_name)) def close(self): """ Cleans up anything from the process """ self.logger.info("Closing Rest Service") self.closed = True # close threads self._close_thread(self._redis_thread, "Redis setup") self._close_thread(self._heartbeat_thread, "Heartbeat") self._close_thread(self._kafka_thread, "Kafka setup") self._close_thread(self._consumer_thread, "Consumer") # close kafka if self.consumer is not None: self.logger.debug("Closing kafka consumer") self.consumer.close() if self.producer is not None: self.logger.debug("Closing kafka producer") self.producer.close(timeout=10) def _calculate_health(self): """Returns a string representation of the node health @returns: GREEN if fully connected, YELLOW if partially connected, RED if not connected """ if self.redis_connected and self.kafka_connected: return "GREEN" elif self.redis_connected or self.kafka_connected: return "YELLOW" else: return "RED" def _kafka_success(self, response): ''' Callback for successful send ''' self.logger.debug("Sent message to Kafka") def _kafka_failure(self, response): ''' Callback for failed send ''' self.logger.error("Failed to send message to Kafka") self._spawn_kafka_connection_thread() def _feed_to_kafka(self, json_item): """Sends a request to Kafka :param json_item: The json item to send :returns: A boolean indicating whether the data was sent successfully or not """ @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False) def _feed(json_item): try: self.logger.debug("Sending json to kafka at " + str(self.settings['KAFKA_PRODUCER_TOPIC'])) future = self.producer.send( self.settings['KAFKA_PRODUCER_TOPIC'], json_item) future.add_callback(self._kafka_success) future.add_errback(self._kafka_failure) self.producer.flush() return True except Exception as e: self.logger.error("Lost connection to Kafka") self._spawn_kafka_connection_thread() return False return _feed(json_item) # Routes -------------------- def _decorate_routes(self): """ Decorates the routes to use within the flask app """ self.logger.debug("Decorating routes") # self.app.add_url_rule('/', 'catch', self.catch, methods=['GET'], # defaults={'path': ''}) self.app.add_url_rule('/<path:path>', 'catch', self.catch, methods=['GET', 'POST'], defaults={'path': ''}) self.app.add_url_rule('/', 'index', self.index, methods=['POST', 'GET']) self.app.add_url_rule('/feed', 'feed', self.feed, methods=['POST']) self.app.add_url_rule('/poll', 'poll', self.poll, methods=['POST']) @log_call('Non-existent route called') 
@error_catch def catch(self, path): return self._create_ret_object(self.FAILURE, None, True, self.DOES_NOT_EXIST), 404 @log_call('\'index\' endpoint called') @error_catch def index(self): data = { "kafka_connected": self.kafka_connected, "redis_connected": self.redis_connected, "uptime_sec": int(self.get_time() - self.start_time), "my_id": self.my_uuid, "node_health": self._calculate_health() } return data @validate_json @log_call('\'feed\' endpoint called') @error_catch def feed(self): # proof of concept to write things to kafka if self.kafka_connected: json_item = request.get_json() self.wait_for_response = False result = self._feed_to_kafka(json_item) if 'uuid' in json_item: self.wait_for_response = True with self.uuids_lock: self.uuids[json_item['uuid']] = None if result: true_response = None if self.wait_for_response: self.logger.debug("expecting kafka response for request") the_time = self.get_time() found_item = False while not found_item and int(self.get_time() - the_time) <= self.settings['WAIT_FOR_RESPONSE_TIME']: if self.uuids[json_item['uuid']] is not None: found_item = True true_response = self.uuids[json_item['uuid']] with self.uuids_lock: del self.uuids[json_item['uuid']] else: with self.uuids_lock: # key still exists, means we did not get our # response in time if json_item['uuid'] in self.uuids: self.uuids[json_item['uuid']] = 'poll' self.logger.debug("Did not find response, " "adding to poll") if found_item: self.logger.debug( "Got successful response back from kafka") else: self.logger.warn("Did not get response within timeout " "from kafka. If the request is still " "running, use the `/poll` API") true_response = {"poll_id": json_item['uuid']} else: self.logger.debug("Not expecting response from kafka") return self._create_ret_object(self.SUCCESS, true_response) self.logger.warn("Unable to write request to Kafka, not connected") return self._create_ret_object(self.FAILURE, None, True, "Unable to connect to Kafka"), 500 @validate_json @validate_schema('poll') @log_call('\'poll\' endpoint called') @error_catch def poll(self): """Retrieves older requests that may not make it back quickly enough""" if self.redis_connected: json_item = request.get_json() result = None try: key = "rest:poll:{u}".format(u=json_item['poll_id']) result = self.redis_conn.get(key) if result is not None: result = json.loads(result) self.logger.debug("Found previous poll") self.redis_conn.delete(key) return self._create_ret_object(self.SUCCESS, result) else: self.logger.debug("poll key does not exist") return self._create_ret_object( self.FAILURE, None, True, "Could not find matching poll_id"), 404 except ConnectionError: self.logger.error("Lost connection to Redis") self._spawn_redis_connection_thread() except ValueError: extras = {"value": result} self.logger.warning('Unparseable JSON Received from redis', extra=extras) self.redis_conn.delete(key) return self._create_ret_object( self.FAILURE, None, True, "Unparseable JSON Received " "from redis"), 500 self.logger.warn("Unable to poll redis, not connected") return self._create_ret_object(self.FAILURE, None, True, "Unable to connect to Redis"), 500
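# --- Usage sketch (illustrative; not part of the original source) ---------
# RestService follows the same pattern as the other components: setup()
# starts the Redis/Kafka connection threads and the heartbeat loop, run()
# serves the '/', '/feed' and '/poll' routes on FLASK_PORT, and close()
# joins the daemon threads and shuts down the Kafka clients. The settings
# file name 'localsettings.py' is a placeholder for this example.
def run_rest_service():
    service = RestService('localsettings.py')
    service.setup(level='INFO')
    try:
        service.run()               # blocking Flask server
    finally:
        service.close()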
class Dispatcher(object):

    def __init__(self, tasks, server):
        self.tasks = tasks            # initial URL seed task list
        self.server = server          # redis connection shared with the spiders
        self.wrapper = SettingsWrapper()
        self.spiders = []             # currently running spider nodes
        self.spider_count = 0         # number of currently running spider nodes
        self.chose = None             # consistent-hash continuum of spider nodes
        self.settings = None
        self.logger = None

    def setup(self):
        """Loads the configuration from the settings file"""
        self.settings = self.wrapper.load('settings.py')
        self.logger = LogFactory.get_instance(json=self.settings['LOG_JSON'],
                                              stdout=self.settings['LOG_STDOUT'],
                                              level=self.settings['LOG_LEVEL'],
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

    def initial_seeds(self):
        """Initializes the dispatcher from the 'seeds' list in redis"""
        # wait until at least one seed has been pushed
        while True:
            initial_len = self.server.llen('seeds')
            if initial_len:
                break
            time.sleep(180)
        self.logger.debug("Fetching the initial seed list")
        while True:
            tasks = self.server.lrange('seeds', 0, -1)
            # drain the seeds list after reading it
            self.server.ltrim('seeds', -1, 0)
            self.tasks.extend(tasks)
            if self.tasks:
                break
        self.logger.debug("Fetching the initial number of spider processes")
        self.spiders = self.server.keys('stats:spider:*:*')
        self.spider_count = len(self.spiders)
        if self.spider_count:
            self.logger.debug("Placing spider nodes with the consistent hashing algorithm")
            job_ids = []
            for spider in self.spiders:
                job_ids.append(spider.split(':')[3])
            self.chose = ketama.Continuum(job_ids)
        self.logger.debug("Distributing the initial seed URL queues")
        for task_json in self.tasks:
            task = pickle.loads(task_json)
            if 'url' in task and 'spider_type' in task:
                extract = tldextract.TLDExtract()
                url = task['url']
                spider_type = task['spider_type']
                domain = extract(url).domain
                job_id = self.chose[url.encode('utf-8')]
                queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                    spider_type=spider_type, job_id=job_id, domain=domain)
                priority = task['priority']
                self.server.zadd(queue_key, pickle.dumps(task), priority)
            else:
                self.logger.error("Task must contain both 'url' and 'spider_type'")

    def spider_state_watcher(self):
        """Checks whether the set of spider nodes has changed"""
        self.spiders = self.server.keys('stats:spider:*:*')
        spider_count_now = len(self.spiders)
        if spider_count_now != self.spider_count:
            self.spider_count = spider_count_now
            return True
        return False

    def center_node_dispather(self):
        """Task scheduling loop for the central node"""
        while True:
            self.logger.debug("Fetching newly added URLs")
            tasks = []
            if self.server.llen('seeds'):
                tasks.append(self.server.lpop('seeds'))
            self.tasks.extend(tasks)
            state = self.spider_state_watcher()
            if state:
                self.logger.debug("Iterating over the spider nodes and pausing each running spider")
                spider_ids = []
                spider_ip_ids = []
                for spider_key in self.spiders:
                    spider_ids.append(spider_key.split(':')[3])
                    spider_ip_ids.append((spider_key.split(':')[2], spider_key.split(':')[3]))
                for spider_ip_id in spider_ip_ids:
                    key = '{job}:status'.format(job=spider_ip_id[1])
                    self.server.set(key, 'pause')
                time.sleep(4)
                self.logger.debug("Spider node state changed, rebuilding the hash distribution")
                self.chose = ketama.Continuum(spider_ids)
                self.logger.debug("Re-assigning the crawl tasks each spider node is responsible for; "
                                  "do not start additional spiders during this window")
                queue_keys = self.server.keys('*:queue')
                for queue_key in queue_keys:
                    # pull every url out of the spider queues, then empty them
                    tasks.extend(self.server.zrange(queue_key, 0, -1))
                    self.server.zremrangebyrank(queue_key, 0, -1)
                self.logger.debug("Resuming the previously paused spider nodes")
                for spider_ip_id in spider_ip_ids:
                    key = '{job}:status'.format(job=spider_ip_id[1])
                    self.server.set(key, 'running')
                self.logger.debug("Redistributing URLs")
            for task_json in tasks:
                task = pickle.loads(task_json)
                if 'url' in task and 'spider_type' in task:
                    extract = tldextract.TLDExtract()
                    url = task['url']
                    spider_type = task['spider_type']
                    domain = extract(url).domain
                    job_id = self.chose[url.encode('utf-8')]
                    queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                        spider_type=spider_type, job_id=job_id, domain=domain)
                    priority = task['priority']
                    self.server.zadd(queue_key, pickle.dumps(task), priority)
                else:
                    self.logger.error("Task must contain both 'url' and 'spider_type'")

    def run(self):
        """Starts the dispatcher"""
        self.initial_seeds()
        self.center_node_dispather()
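# --- Usage sketch (illustrative; not part of the original source) ---------
# The Dispatcher is handed a redis client that holds the 'seeds' list and the
# '<spider_type>:<job_id>:<domain>:queue' sorted sets, plus a mutable list
# used as its in-memory task buffer. The connection parameters below are
# assumptions for the example (re-using the module-level redis import);
# setup() reads everything else from 'settings.py'.
def run_dispatcher():
    server = redis.StrictRedis(host='localhost', port=6379, db=0)
    dispatcher = Dispatcher(tasks=[], server=server)
    dispatcher.setup()              # load settings.py and build the logger
    dispatcher.run()                # seed the queues, then rebalance on node changes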