Example #1
 def __init__(self, settings_name, unit_test=False):
     '''
     @param settings_name: the local settings file name
     @param unit_test: whether running unit tests or not
     '''
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.unit_test = unit_test
Example #2
 def __init__(self, settings_name, unit_test=False):
     '''
     @param settings_name: the local settings file name
     @param unit_test: whether running unit tests or not
     '''
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.unit_test = unit_test
     self.my_uuid = str(uuid.uuid4()).split('-')[4]
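A quick illustration of the my_uuid expression above, as a minimal sketch that is not part of the example itself: str(uuid.uuid4()) has five hyphen-separated groups, so index 4 is the final 12-hex-character group, used here as a short per-instance identifier.

import uuid

full_id = str(uuid.uuid4())      # e.g. '1b9d6bcd-bbfd-4b2d-9b5d-ab8dfbbd4bed'
my_uuid = full_id.split('-')[4]  # keeps only the last group, e.g. 'ab8dfbbd4bed'
print(len(my_uuid))              # always 12 hex characters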
Example #3
    def __init__(self, tasks, redis_conn):

        self.tasks = tasks  # initial URL seed queue
        self.redis_conn = redis_conn
        self.wrapper = SettingsWrapper()

        self.spiders = []  # currently running spider nodes
        self.spiders_weights = None  # weights of the current spider nodes
        self.settings = None
        self.logger = None
Example #4
    def __init__(self, tasks, server):

        self.tasks = tasks  # initial URL seed queue
        self.server = server
        self.wrapper = SettingsWrapper()

        self.spiders = []  # currently running spider nodes
        self.spider_count = 0  # number of currently running spider nodes
        self.chose = None  # consistent hashing distribution
        self.settings = None
        self.logger = None
Example #5
 def __init__(self, settings_name):
     """
     @param settings_name: the local settings file name
     """
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.app = Flask(__name__)
     self.kafka_connected = False
     self.redis_connected = False
     self.my_uuid = str(uuid.uuid4()).split('-')[4]
     self.uuids = {}
     self.uuids_lock = threading.Lock()
     self.validator = self._extend_with_default(Draft4Validator)
     self.schemas = {}
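The _extend_with_default(Draft4Validator) call above builds a validator class that fills in schema defaults while validating; its implementation appears in Examples #9 and #10. Below is a small self-contained sketch of that pattern, with a toy schema and request made up purely for illustration.

from jsonschema import Draft4Validator, validators

def extend_with_default(validator_class):
    # Wrap the stock "properties" validator so that, after normal validation,
    # any property carrying a "default" in the schema is written back into
    # the instance being validated.
    validate_properties = validator_class.VALIDATORS["properties"]

    def set_defaults(validator, properties, instance, schema):
        for error in validate_properties(validator, properties, instance, schema):
            yield error
        for prop, subschema in properties.items():
            if "default" in subschema:
                instance.setdefault(prop, subschema["default"])

    return validators.extend(validator_class, {"properties": set_defaults})

DefaultValidator = extend_with_default(Draft4Validator)

schema = {"properties": {"priority": {"type": "integer", "default": 1}}}
request = {}
DefaultValidator(schema).validate(request)
print(request)  # {'priority': 1} -- the default was filled in during validation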
Example #6
 def __init__(self, settings_name, unit_test=False):
     '''
     @param settings_name: the local settings file name
     @param unit_test: whether running unit tests or not
     '''
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.unit_test = unit_test
Example #7
 def __init__(self, settings_name, unit_test=False):
     '''
     @param settings_name: the local settings file name
     @param unit_test: whether running unit tests or not
     '''
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.unit_test = unit_test
     self.my_uuid = str(uuid.uuid4()).split('-')[4]
Example #8
 def __init__(self, settings_name):
     """
     @param settings_name: the local settings file name
     """
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.app = Flask(__name__)
     self.kafka_connected = False
     self.redis_connected = False
     self.my_uuid = str(uuid.uuid4()).split('-')[4]
     self.uuids = {}
     self.uuids_lock = threading.Lock()
     self.validator = self._extend_with_default(Draft4Validator)
     self.schemas = {}
Example #9
class KafkaMonitor(object):

    consumer = None

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test
        self.my_uuid = str(uuid.uuid4()).split('-')[4]

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d+1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None
            with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema
            self.logger.debug("Successfully loaded plugin {cls}".format(cls=key))
            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(sorted(list(self.plugins_dict.items()),
                                               key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json, stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'],
                                 db=self.settings.get('REDIS_DB'))

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
            self.redis_conn = redis_conn
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key1),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        total2 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key2),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                   key='{k}:lifetime'.format(k=temp_key),
                                                   cycle_time=self.settings['STATS_CYCLE'],
                                                   roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        self.consumer = self._create_consumer()
        self.logger.debug("Successfully connected to Kafka")

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in list(properties.items()):
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(old_div(time.time(), self.settings['STATS_DUMP']))
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            self._report_self()
            time.sleep(self.settings['SLEEP_TIME'])

    def _process_messages(self):
        try:
            for message in self.consumer:
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.value)
                    loaded_dict = json.loads(message.value)
                    found_plugin = False
                    for key in self.plugins_dict:
                        # to prevent reference modification
                        the_dict = copy.deepcopy(loaded_dict)
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                    instance.__class__.__name__,
                                    the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn("Did not find schema to validate "
                                         "request", extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.value)
        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek_to_end()
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the loaded message object for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def _report_self(self):
        '''
        Reports the kafka monitor uuid to redis
        '''
        key = "stats:kafka-monitor:self:{m}:{u}".format(
            m=socket.gethostname(),
            u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT'])

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            producer = self._create_producer()
            topic = self.settings['KAFKA_INCOMING_TOPIC']
            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            if producer is not None:
                producer.send(topic, json_item)
                producer.flush()
                producer.close(timeout=10)
                return True
            else:
                return False

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_consumer(self):
        """Tries to establing the Kafka consumer connection"""
        try:
            brokers = self.settings['KAFKA_HOSTS']
            self.logger.debug("Creating new kafka consumer using brokers: " +
                               str(brokers) + ' and topic ' +
                               self.settings['KAFKA_INCOMING_TOPIC'])

            return KafkaConsumer(
                self.settings['KAFKA_INCOMING_TOPIC'],
                group_id=self.settings['KAFKA_GROUP'],
                bootstrap_servers=brokers,
                consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=self.settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'],
                auto_commit_interval_ms=self.settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=self.settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=self.settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except KeyError as e:
            self.logger.error('Missing setting named ' + str(e),
                               {'ex': traceback.format_exc()})
        except:
            self.logger.error("Couldn't initialize kafka consumer for topic",
                               {'ex': traceback.format_exc(),
                                'topic': self.settings['KAFKA_INCOMING_TOPIC']})
            raise

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_producer(self):
        """Tries to establish a Kafka consumer connection"""
        try:
            brokers = self.settings['KAFKA_HOSTS']
            self.logger.debug("Creating new kafka producer using brokers: " +
                               str(brokers))

            return KafkaProducer(bootstrap_servers=brokers,
                                 value_serializer=lambda m: json.dumps(m),
                                 retries=3,
                                 linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'],
                                 buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES'])
        except KeyError as e:
            self.logger.error('Missing setting named ' + str(e),
                               {'ex': traceback.format_exc()})
        except:
            self.logger.error("Couldn't initialize kafka producer.",
                               {'ex': traceback.format_exc()})
            raise

    def close(self):
        '''
        Call to properly tear down the Kafka Monitor
        '''
        if self.consumer is not None:
            self.consumer.close()
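For context, a minimal usage sketch of the KafkaMonitor defined above, not taken from the example itself: the settings file name and the feed payload keys are hypothetical placeholders.

if __name__ == '__main__':
    # 'localsettings.py' is a placeholder settings file name.
    monitor = KafkaMonitor('localsettings.py', unit_test=False)
    monitor.setup(level='INFO', log_file=False, json=False)
    try:
        # feed() pushes a JSON request onto the incoming Kafka topic;
        # the payload keys here are illustrative only.
        monitor.feed({"uuid": "abc123", "appid": "testapp"})
        # run() blocks in the consume -> validate -> plugin-dispatch loop.
        monitor.run()
    finally:
        monitor.close()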
Example #10
class RestService(object):

    # static strings
    SUCCESS = 'SUCCESS'
    FAILURE = 'FAILURE'
    UNKNOWN_ERROR = "An error occurred while processing your request."
    MUST_JSON = "The payload must be valid JSON."
    DOES_NOT_EXIST = "The desired endpoint does not exist"
    BAD_SCHEMA = "JSON did not validate against schema."

    consumer = None
    producer = None
    closed = False
    start_time = 0
    _consumer_thread = None
    _kafka_thread = None
    _heartbeat_thread = None
    _redis_thread = None

    def __init__(self, settings_name):
        """
        @param settings_name: the local settings file name
        """
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.app = Flask(__name__)
        self.kafka_connected = False
        self.redis_connected = False
        self.my_uuid = str(uuid.uuid4()).split('-')[4]
        self.uuids = {}
        self.uuids_lock = threading.Lock()
        self.validator = self._extend_with_default(Draft4Validator)
        self.schemas = {}

    def setup(self, level=None, log_file=None, json=None):
        """
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        """
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self._decorate_routes()
        self._spawn_redis_connection_thread()
        self._spawn_kafka_connection_thread()

        # spawn heartbeat processing loop
        self._heartbeat_thread = Thread(target=self._heartbeat_loop)
        self._heartbeat_thread.setDaemon(True)
        self._heartbeat_thread.start()

        self.start_time = self.get_time()

        # disable flask logger
        if self.settings['FLASK_LOGGING_ENABLED'] == False:
            log = logging.getLogger('werkzeug')
            log.disabled = True

        self._load_schemas()

    def get_time(self):
        """Returns the current time"""
        return time.time()

    def _load_schemas(self):
        """Loads any schemas for JSON validation"""
        for filename in os.listdir(self.settings['SCHEMA_DIR']):
            if filename[-4:] == 'json':
                name = filename[:-5]
                with open(self.settings['SCHEMA_DIR'] + filename) as the_file:
                    self.schemas[name] = json.load(the_file)
                    self.logger.debug("Successfully loaded " + filename +
                                      " schema")

    def _extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in list(properties.items()):
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )

    def _spawn_redis_connection_thread(self):
        """Spawns a redis connection thread"""
        self.logger.debug("Spawn redis connection thread")
        self.redis_connected = False
        self._redis_thread = Thread(target=self._setup_redis)
        self._redis_thread.setDaemon(True)
        self._redis_thread.start()

    def _spawn_kafka_connection_thread(self):
        """Spawns a kafka connection thread"""
        self.logger.debug("Spawn kafka connection thread")
        self.kafka_connected = False
        self._kafka_thread = Thread(target=self._setup_kafka)
        self._kafka_thread.setDaemon(True)
        self._kafka_thread.start()

    def _spawn_kafka_consumer_thread(self):
        """Spawns a kafka continuous consumer thread"""
        self.logger.debug("Spawn kafka consumer thread" "")
        self._consumer_thread = Thread(target=self._consumer_loop)
        self._consumer_thread.setDaemon(True)
        self._consumer_thread.start()

    def _consumer_loop(self):
        """The main consumer loop"""
        self.logger.debug("running main consumer thread")
        while not self.closed:
            if self.kafka_connected:
                self._process_messages()
            time.sleep(self.settings['KAFKA_CONSUMER_SLEEP_TIME'])

    def _process_messages(self):
        """Processes messages received from kafka"""
        try:
            for message in self.consumer:
                try:
                    if message is None:
                        self.logger.debug("no message")
                        break
                    loaded_dict = json.loads(message.value)
                    self.logger.debug("got valid kafka message")

                    with self.uuids_lock:
                        if 'uuid' in loaded_dict:
                            if loaded_dict['uuid'] in self.uuids and \
                                    self.uuids[loaded_dict['uuid']] != 'poll':
                                self.logger.debug(
                                    "Found Kafka message from request")
                                self.uuids[loaded_dict['uuid']] = loaded_dict
                            else:
                                self.logger.debug("Got poll result")
                                self._send_result_to_redis(loaded_dict)
                        else:
                            self.logger.debug(
                                "Got message not intended for this process")
                except ValueError:
                    extras = {}
                    if message is not None:
                        extras["data"] = message.value
                    self.logger.warning('Unparseable JSON Received from kafka',
                                        extra=extras)

            self._check_kafka_disconnect()

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek_to_end()
            self.logger.error("Kafka offset out of range error")

    def _send_result_to_redis(self, result):
        """Sends the result of a poll to redis to be used potentially by
        another process

        @param result: the result retrieved from kafka"""
        if self.redis_connected:
            self.logger.debug("Sending result to redis")
            try:
                key = "rest:poll:{u}".format(u=result['uuid'])
                self.redis_conn.set(key, json.dumps(result))
            except ConnectionError:
                self.logger.error("Lost connection to Redis")
                self._spawn_redis_connection_thread()
        else:
            self.logger.warning(
                "Unable to send result to redis, not connected")

    def _check_kafka_disconnect(self):
        """Checks the kafka connection is still valid"""
        for node_id in self.consumer._client._conns:
            conn = self.consumer._client._conns[node_id]
            if conn.state == ConnectionStates.DISCONNECTED or \
                    conn.state == ConnectionStates.DISCONNECTING:
                self._spawn_kafka_connection_thread()
                break

    def _heartbeat_loop(self):
        """A main run loop thread to do work"""
        self.logger.debug("running main heartbeat thread")
        while not self.closed:
            time.sleep(self.settings['SLEEP_TIME'])
            self._report_self()

    def _report_self(self):
        """
        Reports the crawler uuid to redis
        """
        if self.redis_connected:
            self.logger.debug("Reporting self to redis")
            try:
                key = "stats:rest:self:{m}:{u}".format(m=socket.gethostname(),
                                                       u=self.my_uuid)
                self.redis_conn.set(key, self.get_time())
                self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT'])
            except ConnectionError:
                self.logger.error("Lost connection to Redis")
                self._spawn_redis_connection_thread()
        else:
            self.logger.warn("Cannot report self to redis, not connected")

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _setup_redis(self):
        """Returns a Redis Client"""
        if not self.closed:
            try:
                self.logger.debug("Creating redis connection to host " +
                                  str(self.settings['REDIS_HOST']))
                self.redis_conn = redis.StrictRedis(
                    host=self.settings['REDIS_HOST'],
                    port=self.settings['REDIS_PORT'],
                    db=self.settings['REDIS_DB'])
                self.redis_conn.info()
                self.redis_connected = True
                self.logger.info("Successfully connected to redis")
            except KeyError as e:
                self.logger.error('Missing setting named ' + str(e),
                                  {'ex': traceback.format_exc()})
            except:
                self.logger.error("Couldn't initialize redis client.",
                                  {'ex': traceback.format_exc()})
                raise

    def _setup_kafka(self):
        """
        Sets up kafka connections
        """
        # close older connections
        if self.consumer is not None:
            self.logger.debug("Closing existing kafka consumer")
            self.consumer.close()
            self.consumer = None
        if self.producer is not None:
            self.logger.debug("Closing existing kafka producer")
            self.producer.flush()
            self.producer.close(timeout=10)
            self.producer = None

        # create new connections
        self._consumer_thread = None
        self.logger.debug("Creating kafka connections")
        self.consumer = self._create_consumer()
        if not self.closed:
            self.logger.debug("Kafka Conumer created")
        self.producer = self._create_producer()
        if not self.closed:
            self.logger.debug("Kafka Producer created")

        if not self.closed:
            self.kafka_connected = True
            self.logger.info("Connected successfully to Kafka")
            self._spawn_kafka_consumer_thread()

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_consumer(self):
        """Tries to establing the Kafka consumer connection"""
        if not self.closed:
            try:
                self.logger.debug(
                    "Creating new kafka consumer using brokers: " +
                    str(self.settings['KAFKA_HOSTS']) + ' and topic ' +
                    self.settings['KAFKA_TOPIC_PREFIX'] + ".outbound_firehose")

                return KafkaConsumer(
                    self.settings['KAFKA_TOPIC_PREFIX'] + ".outbound_firehose",
                    group_id=None,
                    bootstrap_servers=self.settings['KAFKA_HOSTS'],
                    consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'],
                    auto_offset_reset=self.settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'],
                    auto_commit_interval_ms=self.settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                    enable_auto_commit=self.settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                    max_partition_fetch_bytes=self.settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
            except KeyError as e:
                self.logger.error('Missing setting named ' + str(e),
                                  {'ex': traceback.format_exc()})
            except Exception as e:
                self.logger.error(
                    "Couldn't initialize kafka consumer for topic",
                    {'ex': traceback.format_exc()})
                self.logger.error(str(e))
                raise

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_producer(self):
        """Tries to establish a Kafka consumer connection"""
        if not self.closed:
            try:
                self.logger.debug(
                    "Creating new kafka producer using brokers: " +
                    str(self.settings['KAFKA_HOSTS']))

                return KafkaProducer(
                    bootstrap_servers=self.settings['KAFKA_HOSTS'],
                    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                    retries=3,
                    linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'],
                    buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES'])
            except KeyError as e:
                self.logger.error('Missing setting named ' + str(e),
                                  {'ex': traceback.format_exc()})
            except:
                self.logger.error("Couldn't initialize kafka producer.",
                                  {'ex': traceback.format_exc()})
                raise

    def run(self):
        """Main flask run loop"""
        self.logger.info("Running main flask method on port " +
                         str(self.settings['FLASK_PORT']))
        self.app.run(host='0.0.0.0', port=self.settings['FLASK_PORT'])

    def _create_ret_object(self,
                           status=SUCCESS,
                           data=None,
                           error=False,
                           error_message=None,
                           error_cause=None):
        """
        Create generic response objects.

        :param str status: The SUCCESS or FAILURE of the request
        :param obj data: The data to return
        :param bool error: Set to True to add Error response
        :param str error_message: The generic error message
        :param str error_cause: The cause of the error
        :returns: A dictionary of values
        """
        ret = {}
        if status == self.FAILURE:
            ret['status'] = self.FAILURE
        else:
            ret['status'] = self.SUCCESS
        ret['data'] = data

        if error:
            ret['error'] = {}
            if error_message is not None:
                ret['error']['message'] = error_message
            if error_cause is not None:
                ret['error']['cause'] = error_cause
        else:
            ret['error'] = None
        return ret

    def _close_thread(self, thread, thread_name):
        """Closes daemon threads

        @param thread: the thread to close
        @param thread_name: a human readable name of the thread
        """
        if thread is not None and thread.isAlive():
            self.logger.debug(
                "Waiting for {} thread to close".format(thread_name))
            thread.join(timeout=self.settings['DAEMON_THREAD_JOIN_TIMEOUT'])
            if thread.isAlive():
                self.logger.warn("{} daemon thread unable to be shutdown"
                                 " within timeout".format(thread_name))

    def close(self):
        """
        Cleans up anything from the process
        """
        self.logger.info("Closing Rest Service")
        self.closed = True

        # close threads
        self._close_thread(self._redis_thread, "Redis setup")
        self._close_thread(self._heartbeat_thread, "Heartbeat")
        self._close_thread(self._kafka_thread, "Kafka setup")
        self._close_thread(self._consumer_thread, "Consumer")

        # close kafka
        if self.consumer is not None:
            self.logger.debug("Closing kafka consumer")
            self.consumer.close()
        if self.producer is not None:
            self.logger.debug("Closing kafka producer")
            self.producer.close(timeout=10)

    def _calculate_health(self):
        """Returns a string representation of the node health

        @returns: GREEN if fully connected, YELLOW if partially connected,
                  RED if not connected
        """
        if self.redis_connected and self.kafka_connected:
            return "GREEN"
        elif self.redis_connected or self.kafka_connected:
            return "YELLOW"
        else:
            return "RED"

    def _kafka_success(self, response):
        '''
        Callback for successful send
        '''
        self.logger.debug("Sent message to Kafka")

    def _kafka_failure(self, response):
        '''
        Callback for failed send
        '''
        self.logger.error("Failed to send message to Kafka")
        self._spawn_kafka_connection_thread()

    def _feed_to_kafka(self, json_item):
        """Sends a request to Kafka

        :param json_item: The json item to send
        :returns: A boolean indicating whether the data was sent successfully or not
        """
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.logger.debug("Sending json to kafka at " +
                                  str(self.settings['KAFKA_PRODUCER_TOPIC']))
                future = self.producer.send(
                    self.settings['KAFKA_PRODUCER_TOPIC'], json_item)
                future.add_callback(self._kafka_success)
                future.add_errback(self._kafka_failure)

                self.producer.flush()

                return True

            except Exception as e:
                self.logger.error("Lost connection to Kafka")
                self._spawn_kafka_connection_thread()
                return False

        return _feed(json_item)

    # Routes --------------------

    def _decorate_routes(self):
        """
        Decorates the routes to use within the flask app
        """
        self.logger.debug("Decorating routes")
        # self.app.add_url_rule('/', 'catch', self.catch, methods=['GET'],
        #                        defaults={'path': ''})
        self.app.add_url_rule('/<path:path>',
                              'catch',
                              self.catch,
                              methods=['GET', 'POST'],
                              defaults={'path': ''})
        self.app.add_url_rule('/',
                              'index',
                              self.index,
                              methods=['POST', 'GET'])
        self.app.add_url_rule('/feed', 'feed', self.feed, methods=['POST'])
        self.app.add_url_rule('/poll', 'poll', self.poll, methods=['POST'])

    @log_call('Non-existent route called')
    @error_catch
    def catch(self, path):
        return self._create_ret_object(self.FAILURE, None, True,
                                       self.DOES_NOT_EXIST), 404

    @log_call('\'index\' endpoint called')
    @error_catch
    def index(self):
        data = {
            "kafka_connected": self.kafka_connected,
            "redis_connected": self.redis_connected,
            "uptime_sec": int(self.get_time() - self.start_time),
            "my_id": self.my_uuid,
            "node_health": self._calculate_health()
        }

        return data

    @validate_json
    @log_call('\'feed\' endpoint called')
    @error_catch
    def feed(self):
        # proof of concept to write things to kafka
        if self.kafka_connected:
            json_item = request.get_json()
            self.wait_for_response = False
            result = self._feed_to_kafka(json_item)

            if 'uuid' in json_item:
                self.wait_for_response = True
                with self.uuids_lock:
                    self.uuids[json_item['uuid']] = None

            if result:
                true_response = None
                if self.wait_for_response:
                    self.logger.debug("expecting kafka response for request")
                    the_time = self.get_time()
                    found_item = False
                    while not found_item and int(self.get_time(
                    ) - the_time) <= self.settings['WAIT_FOR_RESPONSE_TIME']:
                        if self.uuids[json_item['uuid']] is not None:
                            found_item = True
                            true_response = self.uuids[json_item['uuid']]
                            with self.uuids_lock:
                                del self.uuids[json_item['uuid']]
                    else:
                        with self.uuids_lock:
                            # key still exists, meaning we did not get our
                            # response in time
                            if json_item['uuid'] in self.uuids:
                                self.uuids[json_item['uuid']] = 'poll'
                                self.logger.debug("Did not find response, "
                                                  "adding to poll")
                    if found_item:
                        self.logger.debug(
                            "Got successful reponse back from kafka")
                    else:
                        self.logger.warn("Did not get response within timeout "
                                         "from kafka. If the request is still "
                                         "running, use the `/poll` API")
                        true_response = {"poll_id": json_item['uuid']}
                else:
                    self.logger.debug("Not expecting response from kafka")

                return self._create_ret_object(self.SUCCESS, true_response)

        self.logger.warn("Unable to write request to Kafka, not connected")
        return self._create_ret_object(self.FAILURE, None, True,
                                       "Unable to connect to Kafka"), 500

    @validate_json
    @validate_schema('poll')
    @log_call('\'poll\' endpoint called')
    @error_catch
    def poll(self):
        """Retrieves older requests that may not make it back quick
        enough"""
        if self.redis_connected:
            json_item = request.get_json()
            result = None
            try:
                key = "rest:poll:{u}".format(u=json_item['poll_id'])
                result = self.redis_conn.get(key)

                if result is not None:
                    result = json.loads(result)
                    self.logger.debug("Found previous poll")
                    self.redis_conn.delete(key)
                    return self._create_ret_object(self.SUCCESS, result)
                else:
                    self.logger.debug("poll key does not exist")
                    return self._create_ret_object(
                        self.FAILURE, None, True,
                        "Could not find matching poll_id"), 404
            except ConnectionError:
                self.logger.error("Lost connection to Redis")
                self._spawn_redis_connection_thread()
            except ValueError:
                extras = {"value": result}
                self.logger.warning('Unparseable JSON Received from redis',
                                    extra=extras)
                self.redis_conn.delete(key)
                return self._create_ret_object(
                    self.FAILURE, None, True, "Unparseable JSON Received "
                    "from redis"), 500
        self.logger.warn("Unable to poll redis, not connected")
        return self._create_ret_object(self.FAILURE, None, True,
                                       "Unable to connect to Redis"), 500
Example #11
class RedisMonitor(object):

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.redis_conn = None
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test
        self.my_uuid = str(uuid.uuid4()).split('-')[4]

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json,
                                              stdout=my_output, level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.redis_conn = redis.StrictRedis(host=self.settings['REDIS_HOST'],
                                            port=self.settings['REDIS_PORT'],
                                            db=self.settings['REDIS_DB'],
                                            password=self.settings['REDIS_PASSWORD'],
                                            decode_responses=True,
                                            socket_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT'),
                                            socket_connect_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT'))
        # redis_lock needs a redis connection without setting decode_responses
        # to True
        self.lock_redis_conn = redis.StrictRedis(host=self.settings['REDIS_HOST'],
                                                 port=self.settings['REDIS_PORT'],
                                                 db=self.settings['REDIS_DB'],
                                                 password=self.settings['REDIS_PASSWORD'],
                                                 socket_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT'),
                                                 socket_connect_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT'))

        try:
            self.redis_conn.info()
            self.logger.debug("Successfully connected to Redis")
        except ConnectionError:
            self.logger.error("Failed to connect to Redis")
            # essential to functionality
            sys.exit(1)

        self._load_plugins()
        self._setup_stats()

    def import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d+1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins and defaults
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}"
                              .format(cls=key))
            the_class = self.import_class(key)
            instance = the_class()
            instance.redis_conn = self.redis_conn
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_regex = instance.regex

            mini = {}
            mini['instance'] = instance
            if the_regex is None:
                raise ImportError()
                # continue
            mini['regex'] = the_regex

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(sorted(list(self.plugins_dict.items()),
                                               key=lambda t: t[0]))

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        self.logger.debug("Running main loop")
        old_time = 0
        while True:
            for plugin_key in self.plugins_dict:
                obj = self.plugins_dict[plugin_key]
                self._process_plugin(obj)

            if self.settings['STATS_DUMP'] != 0:
                new_time = int(old_div(time.time(), self.settings['STATS_DUMP']))
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()

                    if self.settings['STATS_DUMP_CRAWL']:
                        self._dump_crawl_stats()

                    if self.settings['STATS_DUMP_QUEUE']:
                        self._dump_queue_stats()

                    old_time = new_time
            self._report_self()
            time.sleep(self.settings['SLEEP_TIME'])

    def _process_plugin(self, plugin):
        '''
        Logic to handle each plugin that is active

        @param plugin: a plugin dict object
        '''
        instance = plugin['instance']
        regex = plugin['regex']
        for key in self.redis_conn.scan_iter(match=regex):
            # acquire lock
            lock = self._create_lock_object(key)

            try:
                if lock.acquire(blocking=False):
                    val = self.redis_conn.get(key)
                    self._process_key_val(instance, key, val)
            except Exception:
                self.logger.error(traceback.format_exc())
                self._increment_fail_stat('{k}:{v}'.format(k=key, v=val))

                self._process_failures(key)

            # remove lock regardless of if exception or was handled ok
            if lock._held:
                self.logger.debug("releasing lock")
                lock.release()

    def _create_lock_object(self, key):
        '''
        Returns a lock object, split for testing
        '''
        return redis_lock.Lock(self.lock_redis_conn, key,
                               expire=self.settings['REDIS_LOCK_EXPIRATION'],
                               auto_renewal=True)

    def _get_fail_key(self, key):
        '''
        Returns the fail key string of a normal key
        '''
        return 'lock:{k}:failures'.format(k=key)

    def _process_failures(self, key):
        '''
        Handles the retrying of the failed key
        '''
        if self.settings['RETRY_FAILURES']:
            self.logger.debug("going to retry failure")
            # get the current failure count
            failkey = self._get_fail_key(key)
            current = self.redis_conn.get(failkey)
            if current is None:
                current = 0
            else:
                current = int(current)
            if current < self.settings['RETRY_FAILURES_MAX']:
                self.logger.debug("Incr fail key")
                current += 1
                self.redis_conn.set(failkey, current)
            else:
                self.logger.error("Could not process action within"
                                  " failure limit")
                self.redis_conn.delete(failkey)
                self.redis_conn.delete(key)

    def _process_key_val(self, instance, key, val):
        '''
        Logic to let the plugin instance process the redis key/val
        Split out for unit testing

        @param instance: the plugin instance
        @param key: the redis key
        @param val: the key value from redis
        '''
        if instance.check_precondition(key, val):
            combined = '{k}:{v}'.format(k=key, v=val)
            self._increment_total_stat(combined)
            self._increment_plugin_stat(
                instance.__class__.__name__,
                combined)
            instance.handle(key, val)
            self.redis_conn.delete(key)
            failkey = self._get_fail_key(key)
            if self.redis_conn.exists(failkey):
                self.redis_conn.delete(failkey)

    def _setup_stats(self):
        '''
        Sets up the stats
        '''
        # stats setup
        self.stats_dict = {}

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total()

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins()

    def _setup_stats_total(self):
        '''
        Sets up the total stats collectors
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:redis-monitor:total'
        temp_key2 = 'stats:redis-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=self.redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=self.redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn,
                        key='{k}:lifetime'.format(k=temp_key1),
                        cycle_time=self.settings['STATS_CYCLE'],
                        roll=False)
        total2 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn,
                        key='{k}:lifetime'.format(k=temp_key2),
                        cycle_time=self.settings['STATS_CYCLE'],
                        roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self):
        '''
        Sets up the plugin stats collectors
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:redis-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=self.redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError as e:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(redis_conn=self.redis_conn,
                            key='{k}:lifetime'.format(k=temp_key),
                            cycle_time=self.settings['STATS_CYCLE'],
                            roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _increment_total_stat(self, item):
        '''
        Increments the total stat counters

        @param item: the unique string used for the HLL counter
        '''
        item = item + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(item)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the unique string used for the HLL counter
        '''
        item = item + str(time.time())
        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the unique string used for the HLL counter
        '''
        item = item + str(time.time())
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Redis Monitor Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Redis Monitor Stats Dump', extra=extras)

    def _dump_crawl_stats(self):
        '''
        Dumps flattened crawling stats so the spiders do not have to
        '''
        extras = {}
        spiders = {}

        spider_set = set()
        total_spider_count = 0

        keys = self.redis_conn.keys('stats:crawler:*:*:*')
        for key in keys:
            # we only care about the spider
            elements = key.split(":")
            spider = elements[3]

            if spider not in spiders:
                spiders[spider] = 0

            if len(elements) == 6:
                # got a time based stat
                response = elements[4]
                end = elements[5]

                final = '{s}_{r}_{e}'.format(s=spider, r=response, e=end)

                if end == 'lifetime':
                    value = self.redis_conn.execute_command("PFCOUNT", key)
                else:
                    value = self.redis_conn.zcard(key)

                extras[final] = value

            elif len(elements) == 5:
                # got a spider identifier
                spiders[spider] += 1
                total_spider_count += 1
                spider_set.add(spider)

            else:
                self.logger.warn("Unknown crawler stat key", {"key":key})

        # simple counts
        extras['unique_spider_count'] = len(spider_set)
        extras['total_spider_count'] = total_spider_count

        for spider in spiders:
            extras['{k}_spider_count'.format(k=spider)] = spiders[spider]

        if not self.logger.json:
            self.logger.info('Crawler Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Crawler Stats Dump', extra=extras)
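
    # Added note: the keys scanned above are assumed to follow the convention
    # 'stats:crawler:<hostname>:<spider>:<status_code>:<window>' for time-based
    # entries (6 elements) and 'stats:crawler:<hostname>:<spider>:<uuid>' for
    # live spider heartbeats (5 elements), which is why elements[3] is read as
    # the spider name.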

    def _dump_queue_stats(self):
        '''
        Dumps basic info about the queue lengths for the spider types
        '''
        extras = {}
        keys = self.redis_conn.keys('*:*:queue')
        total_backlog = 0
        for key in keys:
            elements = key.split(":")
            spider = elements[0]
            domain = elements[1]
            spider = 'queue_' + spider

            if spider not in extras:
                extras[spider] = {}
                extras[spider]['spider_backlog'] = 0
                extras[spider]['num_domains'] = 0

            count = self.redis_conn.zcard(key)
            total_backlog += count
            extras[spider]['spider_backlog'] += count
            extras[spider]['num_domains'] += 1

        extras['total_backlog'] = total_backlog

        if not self.logger.json:
            self.logger.info('Queue Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Queue Stats Dump', extra=extras)
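
    # Added note: queue keys are assumed to look like '<spider>:<domain>:queue'
    # (e.g. 'link:example.com:queue'), so zcard() on each key yields the
    # per-domain backlog rolled up into 'queue_<spider>' above.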

    def _report_self(self):
        '''
        Reports the redis monitor uuid to redis
        '''
        key = "stats:redis-monitor:self:{m}:{u}".format(
            m=socket.gethostname(),
            u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT'])

    def close(self):
        '''
        Closes the Redis Monitor and plugins
        '''
        for plugin_key in self.plugins_dict:
            obj = self.plugins_dict[plugin_key]
            instance = obj['instance']
            instance.close()
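
# Hedged, standalone sketch (not part of the class above): the same scutils
# StatsCollector calls used in _setup_stats_total(), exercised directly. The
# Redis host, the cycle_time value, and the SECONDS_15_MINUTE constant name are
# assumptions based on how the class uses the library, not taken from this listing.
import time

import redis
from scutils.stats_collector import StatsCollector

def stats_collector_sketch():
    conn = redis.Redis(host='localhost', port=6379)
    window = StatsCollector.SECONDS_15_MINUTE  # assumed constant, resolves to seconds
    rolling = StatsCollector.get_rolling_time_window(
        redis_conn=conn,
        key='stats:redis-monitor:total:{t}'.format(t=window),
        window=window,
        cycle_time=5)
    lifetime = StatsCollector.get_hll_counter(
        redis_conn=conn,
        key='stats:redis-monitor:total:lifetime',
        cycle_time=5,
        roll=False)
    # rolling windows count events; the HLL counter counts unique strings
    rolling.increment()
    lifetime.increment('some:key:value' + str(time.time()))
    print(rolling.value(), lifetime.value())
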
Exemple #12
0
import argparse
from scutils.settings_wrapper import SettingsWrapper

# set up arg parser
parser = argparse.ArgumentParser(
    description='Example SettingsWrapper parser.\n')
parser.add_argument('-s', '--settings', action='store', required=False,
                    help="The default settings file",
                    default="settings.py")
parser.add_argument('-o', '--override-settings', action='store', required=False,
                    help="The override settings file",
                    default="localsettings.py")
parser.add_argument('-v', '--variable', action='store', required=False,
                    help="The variable to print out",
                    default=None)
args = vars(parser.parse_args())

# load up settings
wrapper = SettingsWrapper()
my_settings = wrapper.load(default=args['settings'],
                           local=args['override_settings'])

if args['variable'] is not None:
    if args['variable'] in my_settings:
        print(args['variable'], '=', my_settings[args['variable']])
    else:
        print(args['variable'], "not in loaded settings")
else:
    print("Full settings:", my_settings)
Exemple #13
0
    def setUp(self):
        self.wrapper = SettingsWrapper()
        self.wrapper.my_settings = {}
Exemple #14
0
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.', add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store', required=False,
                        help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store', required=False,
                        help="The settings file to read from",
                        default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store', required=False,
                        help="The log level", default=None,
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                             " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")
    dump_parser.add_argument('-m', '--mongodb', action="store",
                             help="The MongoDB connection used to save web pages")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = SimpleClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)
        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        try:
            logger.debug("Getting Kafka consumer")

            offset = 'earliest' if args["from_beginning"] else 'latest'

            consumer = KafkaConsumer(
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable as ex:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body']).decode('utf-8')

                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except BaseException as msg:
                        logger.info("Message is not a JSON object")
                        logger.info("base64 error: {0}".format(msg))
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                logger.error(traceback.format_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024*1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                    .format(n=num_records, m=total_mbs,
                            kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
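
# Added, hedged: a dump utility like the one above is normally wired to an
# entry-point guard so it can be driven from the shell, for example
#   python kafkadump.py list -s localsettings.py
#   python kafkadump.py dump -t demo.crawled_firehose -p
# (the file name and topic are illustrative assumptions, not taken from this listing)
if __name__ == '__main__':
    import sys
    sys.exit(main())
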
Exemple #15
0
class JayRedisMonitor:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.redis_conn = None
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json,
                                              stdout=my_output,
                                              level=my_level,
                                              #name=self.settings['LOGGER_NAME'],
                                              name = "jay-redis-monitor",
                                              dir=self.settings['LOG_DIR'],
                                              #file=self.settings['LOG_FILE'],
                                              file="jay_redis_monitor.log",
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
            self.logger.debug("Successfully connected to Redis")
        except ConnectionError:
            self.logger.error("Failed to connect to Redis")
            # essential to functionality
            sys.exit(1)

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        self.logger.debug("Running main loop")
        print('Running main loop')
        jaystats = JayStatsMonitor()
        jaystats.setup(self.logger, self.redis_conn)
        jayinfo = JayInfoMonitor()
        jayinfo.setup(self.logger, self.redis_conn)

        while True:
            jaystats.handle()
            jayinfo.handle()

            time.sleep(1)
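
# Hedged sketch: how a monitor like the class above would typically be started.
# The settings file name and log level are illustrative assumptions.
if __name__ == '__main__':
    monitor = JayRedisMonitor('localsettings.py')
    monitor.setup(level='DEBUG')
    monitor.run()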
Exemple #16
0
class RestService(object):

    # static strings
    SUCCESS = 'SUCCESS'
    FAILURE = 'FAILURE'
    UNKNOWN_ERROR = "An error occurred while processing your request."
    MUST_JSON = "The payload must be valid JSON."
    DOES_NOT_EXIST = "The desired endpoint does not exist"
    BAD_SCHEMA = "JSON did not validate against schema."

    consumer = None
    producer = None
    closed = False
    start_time = 0
    _consumer_thread = None
    _kafka_thread = None
    _heartbeat_thread = None
    _redis_thread = None

    def __init__(self, settings_name):
        """
        @param settings_name: the local settings file name
        """
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.app = Flask(__name__)
        self.kafka_connected = False
        self.redis_connected = False
        self.my_uuid = str(uuid.uuid4()).split('-')[4]
        self.uuids = {}
        self.uuids_lock = threading.Lock()
        self.validator = self._extend_with_default(Draft4Validator)
        self.schemas = {}

    def setup(self, level=None, log_file=None, json=None):
        """
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        """
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json, stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self._decorate_routes()
        self._spawn_redis_connection_thread()
        self._spawn_kafka_connection_thread()

        # spawn heartbeat processing loop
        self._heartbeat_thread = Thread(target=self._heartbeat_loop)
        self._heartbeat_thread.setDaemon(True)
        self._heartbeat_thread.start()

        self.start_time = self.get_time()

        # disable flask logger
        if self.settings['FLASK_LOGGING_ENABLED'] == False:
            log = logging.getLogger('werkzeug')
            log.disabled = True

        self._load_schemas()

    def get_time(self):
        """Returns the current time"""
        return time.time()

    def _load_schemas(self):
        """Loads any schemas for JSON validation"""
        for filename in os.listdir(self.settings['SCHEMA_DIR']):
            if filename.endswith('.json'):
                name = filename[:-5]
                with open(self.settings['SCHEMA_DIR'] + filename) as the_file:
                    self.schemas[name] = json.load(the_file)
                    self.logger.debug("Successfully loaded " + filename + " schema")

    def _extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in list(properties.items()):
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _spawn_redis_connection_thread(self):
        """Spawns a redis connection thread"""
        self.logger.debug("Spawn redis connection thread")
        self.redis_connected = False
        self._redis_thread = Thread(target=self._setup_redis)
        self._redis_thread.setDaemon(True)
        self._redis_thread.start()

    def _spawn_kafka_connection_thread(self):
        """Spawns a kafka connection thread"""
        self.logger.debug("Spawn kafka connection thread")
        self.kafka_connected = False
        self._kafka_thread = Thread(target=self._setup_kafka)
        self._kafka_thread.setDaemon(True)
        self._kafka_thread.start()

    def _spawn_kafka_consumer_thread(self):
        """Spawns a kafka continuous consumer thread"""
        self.logger.debug("Spawn kafka consumer thread""")
        self._consumer_thread = Thread(target=self._consumer_loop)
        self._consumer_thread.setDaemon(True)
        self._consumer_thread.start()

    def _consumer_loop(self):
        """The main consumer loop"""
        self.logger.debug("running main consumer thread")
        while not self.closed:
            if self.kafka_connected:
                self._process_messages()
            time.sleep(self.settings['KAFKA_CONSUMER_SLEEP_TIME'])

    def _process_messages(self):
        """Processes messages received from kafka"""
        try:
            for message in self.consumer:
                try:
                    if message is None:
                        self.logger.debug("no message")
                        break
                    loaded_dict = json.loads(message.value)
                    self.logger.debug("got valid kafka message")

                    with self.uuids_lock:
                        if 'uuid' in loaded_dict:
                            if loaded_dict['uuid'] in self.uuids and \
                                    self.uuids[loaded_dict['uuid']] != 'poll':
                                self.logger.debug("Found Kafka message from request")
                                self.uuids[loaded_dict['uuid']] = loaded_dict
                            else:
                                self.logger.debug("Got poll result")
                                self._send_result_to_redis(loaded_dict)
                        else:
                            self.logger.debug("Got message not intended for this process")
                except ValueError:
                    extras = {}
                    if message is not None:
                        extras["data"] = message.value
                    self.logger.warning('Unparseable JSON Received from kafka',
                                                extra=extras)

            self._check_kafka_disconnect()

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek_to_end()
            self.logger.error("Kafka offset out of range error")

    def _send_result_to_redis(self, result):
        """Sends the result of a poll to redis to be used potentially by
        another process

        @param result: the result retrieved from kafka"""
        if self.redis_connected:
            self.logger.debug("Sending result to redis")
            try:
                key = "rest:poll:{u}".format(u=result['uuid'])
                self.redis_conn.set(key, json.dumps(result))
            except ConnectionError:
                self.logger.error("Lost connection to Redis")
                self._spawn_redis_connection_thread()
        else:
            self.logger.warning("Unable to send result to redis, not connected")

    def _check_kafka_disconnect(self):
        """Checks the kafka connection is still valid"""
        for node_id in self.consumer._client._conns:
            conn = self.consumer._client._conns[node_id]
            if conn.state == ConnectionStates.DISCONNECTED or \
                    conn.state == ConnectionStates.DISCONNECTING:
                self._spawn_kafka_connection_thread()
                break

    def _heartbeat_loop(self):
        """A main run loop thread to do work"""
        self.logger.debug("running main heartbeat thread")
        while not self.closed:
            time.sleep(self.settings['SLEEP_TIME'])
            self._report_self()

    def _report_self(self):
        """
        Reports the crawler uuid to redis
        """
        if self.redis_connected:
            self.logger.debug("Reporting self to redis")
            try:
                key = "stats:rest:self:{m}:{u}".format(
                    m=socket.gethostname(),
                    u=self.my_uuid)
                self.redis_conn.set(key, self.get_time())
                self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT'])
            except ConnectionError:
                self.logger.error("Lost connection to Redis")
                self._spawn_redis_connection_thread()
        else:
            self.logger.warn("Cannot report self to redis, not connected")


    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _setup_redis(self):
        """Returns a Redis Client"""
        if not self.closed:
            try:
                self.logger.debug("Creating redis connection to host " +
                                  str(self.settings['REDIS_HOST']))
                self.redis_conn = redis.StrictRedis(host=self.settings['REDIS_HOST'],
                                              port=self.settings['REDIS_PORT'],
                                              db=self.settings['REDIS_DB'])
                self.redis_conn.info()
                self.redis_connected = True
                self.logger.info("Successfully connected to redis")
            except KeyError as e:
                self.logger.error('Missing setting named ' + str(e),
                                   {'ex': traceback.format_exc()})
            except:
                self.logger.error("Couldn't initialize redis client.",
                                   {'ex': traceback.format_exc()})
                raise

    def _setup_kafka(self):
        """
        Sets up kafka connections
        """
        # close older connections
        if self.consumer is not None:
            self.logger.debug("Closing existing kafka consumer")
            self.consumer.close()
            self.consumer = None
        if self.producer is not None:
            self.logger.debug("Closing existing kafka producer")
            self.producer.flush()
            self.producer.close(timeout=10)
            self.producer = None

        # create new connections
        self._consumer_thread = None
        self.logger.debug("Creating kafka connections")
        self.consumer = self._create_consumer()
        if not self.closed:
            self.logger.debug("Kafka Conumer created")
        self.producer = self._create_producer()
        if not self.closed:
            self.logger.debug("Kafka Producer created")

        if not self.closed:
            self.kafka_connected = True
            self.logger.info("Connected successfully to Kafka")
            self._spawn_kafka_consumer_thread()

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_consumer(self):
        """Tries to establing the Kafka consumer connection"""
        if not self.closed:
            try:
                self.logger.debug("Creating new kafka consumer using brokers: " +
                                   str(self.settings['KAFKA_HOSTS']) + ' and topic ' +
                                   self.settings['KAFKA_TOPIC_PREFIX'] +
                                   ".outbound_firehose")

                return KafkaConsumer(
                    self.settings['KAFKA_TOPIC_PREFIX'] + ".outbound_firehose",
                    group_id=None,
                    bootstrap_servers=self.settings['KAFKA_HOSTS'],
                    consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'],
                    auto_offset_reset=self.settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'],
                    auto_commit_interval_ms=self.settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                    enable_auto_commit=self.settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                    max_partition_fetch_bytes=self.settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
            except KeyError as e:
                self.logger.error('Missing setting named ' + str(e),
                                   {'ex': traceback.format_exc()})
            except:
                self.logger.error("Couldn't initialize kafka consumer for topic",
                                   {'ex': traceback.format_exc()})
                raise

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_producer(self):
        """Tries to establish a Kafka consumer connection"""
        if not self.closed:
            try:
                self.logger.debug("Creating new kafka producer using brokers: " +
                                   str(self.settings['KAFKA_HOSTS']))

                return KafkaProducer(bootstrap_servers=self.settings['KAFKA_HOSTS'],
                                     value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                                     retries=3,
                                     linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'],
                                     buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES'])
            except KeyError as e:
                self.logger.error('Missing setting named ' + str(e),
                                   {'ex': traceback.format_exc()})
            except:
                self.logger.error("Couldn't initialize kafka producer.",
                                   {'ex': traceback.format_exc()})
                raise

    def run(self):
        """Main flask run loop"""
        self.logger.info("Running main flask method on port " +
                         str(self.settings['FLASK_PORT']))
        self.app.run(host='0.0.0.0', port=self.settings['FLASK_PORT'])

    def _create_ret_object(self, status=SUCCESS, data=None, error=False,
                           error_message=None, error_cause=None):
        """
        Create a generic response object.

        :param str status: The SUCCESS or FAILURE of the request
        :param obj data: The data to return
        :param bool error: Set to True to add Error response
        :param str error_message: The generic error message
        :param str error_cause: The cause of the error
        :returns: A dictionary of values
        """
        ret = {}
        if status == self.FAILURE:
            ret['status'] = self.FAILURE
        else:
            ret['status'] = self.SUCCESS
        ret['data'] = data

        if error:
            ret['error'] = {}
            if error_message is not None:
                ret['error']['message'] = error_message
            if error_cause is not None:
                ret['error']['cause'] = error_cause
        else:
            ret['error'] = None
        return ret
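
    # Added note: a successful call above yields a dict shaped like
    #   {'status': 'SUCCESS', 'data': {...}, 'error': None}
    # while a failure built with error=True yields
    #   {'status': 'FAILURE', 'data': None,
    #    'error': {'message': '...', 'cause': '...'}}
    # ('cause' appears only when error_cause is supplied).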

    def _close_thread(self, thread, thread_name):
        """Closes daemon threads

        @param thread: the thread to close
        @param thread_name: a human readable name of the thread
        """
        if thread is not None and thread.is_alive():
            self.logger.debug("Waiting for {} thread to close".format(thread_name))
            thread.join(timeout=self.settings['DAEMON_THREAD_JOIN_TIMEOUT'])
            if thread.is_alive():
                self.logger.warn("{} daemon thread unable to be shutdown"
                                 " within timeout".format(thread_name))

    def close(self):
        """
        Cleans up anything from the process
        """
        self.logger.info("Closing Rest Service")
        self.closed = True

        # close threads
        self._close_thread(self._redis_thread, "Redis setup")
        self._close_thread(self._heartbeat_thread, "Heartbeat")
        self._close_thread(self._kafka_thread, "Kafka setup")
        self._close_thread(self._consumer_thread, "Consumer")

        # close kafka
        if self.consumer is not None:
            self.logger.debug("Closing kafka consumer")
            self.consumer.close()
        if self.producer is not None:
            self.logger.debug("Closing kafka producer")
            self.producer.close(timeout=10)

    def _calculate_health(self):
        """Returns a string representation of the node health

        @returns: GREEN if fully connected, YELLOW if partially connected,
                  RED if not connected
        """
        if self.redis_connected and self.kafka_connected:
            return "GREEN"
        elif self.redis_connected or self.kafka_connected:
            return "YELLOW"
        else:
            return "RED"

    def _kafka_success(self, response):
        '''
        Callback for successful send
        '''
        self.logger.debug("Sent message to Kafka")

    def _kafka_failure(self, response):
        '''
        Callback for failed send
        '''
        self.logger.error("Failed to send message to Kafka")
        self._spawn_kafka_connection_thread()

    def _feed_to_kafka(self, json_item):
        """Sends a request to Kafka

        :param json_item: The json item to send
        :returns: A boolean indicating whether the data was sent successfully or not
        """
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.logger.debug("Sending json to kafka at " +
                                  str(self.settings['KAFKA_PRODUCER_TOPIC']))
                future = self.producer.send(self.settings['KAFKA_PRODUCER_TOPIC'],
                                   json_item)
                future.add_callback(self._kafka_success)
                future.add_errback(self._kafka_failure)

                self.producer.flush()

                return True

            except Exception as e:
                self.logger.error("Lost connection to Kafka")
                self._spawn_kafka_connection_thread()
                return False

        return _feed(json_item)

    # Routes --------------------

    def _decorate_routes(self):
        """
        Decorates the routes to use within the flask app
        """
        self.logger.debug("Decorating routes")
        # self.app.add_url_rule('/', 'catch', self.catch, methods=['GET'],
        #                        defaults={'path': ''})
        self.app.add_url_rule('/<path:path>', 'catch', self.catch,
                              methods=['GET', 'POST'], defaults={'path': ''})
        self.app.add_url_rule('/', 'index', self.index,
                              methods=['POST', 'GET'])
        self.app.add_url_rule('/feed', 'feed', self.feed,
                              methods=['POST'])
        self.app.add_url_rule('/poll', 'poll', self.poll,
                              methods=['POST'])

    @log_call('Non-existent route called')
    @error_catch
    def catch(self, path):
        return self._create_ret_object(self.FAILURE, None, True,
                                       self.DOES_NOT_EXIST), 404

    @log_call('\'index\' endpoint called')
    @error_catch
    def index(self):
        data = {
            "kafka_connected": self.kafka_connected,
            "redis_connected": self.redis_connected,
            "uptime_sec": int(self.get_time() - self.start_time),
            "my_id": self.my_uuid,
            "node_health": self._calculate_health()
        }

        return data

    @validate_json
    @log_call('\'feed\' endpoint called')
    @error_catch
    def feed(self):
        # proof of concept to write things to kafka
        if self.kafka_connected:
            json_item = request.get_json()
            self.wait_for_response = False
            result = self._feed_to_kafka(json_item)

            if 'uuid' in json_item:
                self.wait_for_response = True
                with self.uuids_lock:
                    self.uuids[json_item['uuid']] = None

            if result:
                true_response = None
                if self.wait_for_response:
                    self.logger.debug("expecting kafka response for request")
                    the_time = self.get_time()
                    found_item = False
                    while not found_item and int(self.get_time() - the_time) <= self.settings['WAIT_FOR_RESPONSE_TIME']:
                        if self.uuids[json_item['uuid']] is not None:
                            found_item = True
                            true_response = self.uuids[json_item['uuid']]
                            with self.uuids_lock:
                                del self.uuids[json_item['uuid']]
                    else:
                        with self.uuids_lock:
                            # key still exists, meaning we did not get our
                            # response in time
                            if json_item['uuid'] in self.uuids:
                                self.uuids[json_item['uuid']] = 'poll'
                                self.logger.debug("Did not find response, "
                                                  "adding to poll")
                    if found_item:
                        self.logger.debug("Got successful reponse back from kafka")
                    else:
                        self.logger.warn("Did not get response within timeout "
                                         "from kafka. If the request is still "
                                         "running, use the `/poll` API")
                        true_response = {
                            "poll_id": json_item['uuid']
                        }
                else:
                    self.logger.debug("Not expecting response from kafka")

                return self._create_ret_object(self.SUCCESS, true_response)

        self.logger.warn("Unable to write request to Kafka, not connected")
        return self._create_ret_object(self.FAILURE, None, True,
                                       "Unable to connect to Kafka"), 500

    @validate_json
    @validate_schema('poll')
    @log_call('\'poll\' endpoint called')
    @error_catch
    def poll(self):
        """Retrieves older requests that may not make it back quick
        enough"""
        if self.redis_connected:
            json_item = request.get_json()
            result = None
            try:
                key = "rest:poll:{u}".format(u=json_item['poll_id'])
                result = self.redis_conn.get(key)

                if result is not None:
                    result = json.loads(result)
                    self.logger.debug("Found previous poll")
                    self.redis_conn.delete(key)
                    return self._create_ret_object(self.SUCCESS, result)
                else:
                    self.logger.debug("poll key does not exist")
                    return self._create_ret_object(self.FAILURE, None, True,
                                       "Could not find matching poll_id"), 404
            except ConnectionError:
                self.logger.error("Lost connection to Redis")
                self._spawn_redis_connection_thread()
            except ValueError:
                extras = {
                    "value": result
                }
                self.logger.warning('Unparseable JSON Received from redis',
                                                extra=extras)
                self.redis_conn.delete(key)
                return self._create_ret_object(self.FAILURE, None, True,
                                               "Unparseable JSON Received "
                                               "from redis"), 500
        self.logger.warn("Unable to poll redis, not connected")
        return self._create_ret_object(self.FAILURE, None, True,
                                       "Unable to connect to Redis"), 500
Exemple #17
0
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
        'debugging.',
        add_help=False)
    parser.add_argument('-h',
                        '--help',
                        action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh',
                             '--kafka-host',
                             action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s',
                             '--settings',
                             action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument(
        '-ll',
        '--log-level',
        action='store',
        required=False,
        help="The log level",
        default=None,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list',
                                        help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump',
                                        help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t',
                             '--topic',
                             action='store',
                             required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c',
                             '--consumer',
                             action='store',
                             required=False,
                             default='default',
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b',
                             '--from-beginning',
                             action='store_const',
                             required=False,
                             const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb',
                             '--no-body',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Do not include the raw html 'body' key in"
                             " the json dump of the topic")
    dump_parser.add_argument('-p',
                             '--pretty',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d',
                             '--decode-base64',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings[
        'KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings[
        'LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    logger.debug("Connecting to {0}...".format(kafka_host))
    try:
        kafka = KafkaClient(kafka_host)
        logger.info("Connected to {0}".format(kafka_host))
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occured. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        logger.error(message)
        sys.exit(1)

    if args['command'] == 'list':
        logger.debug('Running list command')
        print "Topics:"
        for topic in kafka.topic_partitions.keys():
            print "-", topic
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        @MethodTimer.timeout(5, None)
        def _hidden():
            try:
                logger.debug("Ensuring topic {t} exists".format(t=topic))
                kafka.ensure_topic_exists(topic)

                logger.debug("Getting Kafka consumer")
                consumer = SimpleConsumer(kafka,
                                          consumer_id,
                                          topic,
                                          buffer_size=1024 * 100,
                                          fetch_size_bytes=1024 * 100,
                                          max_buffer_size=None)
                return consumer
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                logger.error(message)
                sys.exit(1)

        consumer = _hidden()

        if consumer is None:
            logger.error("Could not fully connect to Kafka within the timeout")
            sys.exit(1)

        if args["from_beginning"]:
            logger.debug("Seeking to beginning")
            consumer.seek(0, 0)
        else:
            logger.debug("Reading from the end")
            consumer.seek(0, 2)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer.get_messages():
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])

                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print json.dumps(item, indent=4)
                    else:
                        print item
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                logger.error(traceback.format_exc())
                break

        total_mbs = float(total_bytes) / (1024 * 1024)
        if item is not None:
            print "Last item:"
            print json.dumps(item, indent=4)
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}".
                format(n=num_records,
                       m=total_mbs,
                       kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        kafka.close()
        return 0
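
# Hedged, standalone sketch of the MethodTimer.timeout decorator used by the
# dump command above: the wrapped call is abandoned after the given number of
# seconds and the supplied default is returned instead. The import path and the
# timings are assumptions for illustration.
import time
from scutils.method_timer import MethodTimer

@MethodTimer.timeout(2, None)
def slow_connect():
    time.sleep(5)          # stand-in for a hung Kafka connection attempt
    return "connected"

print(slow_connect())      # prints None after roughly two seconds
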
Exemple #18
0
class JayRedisMonitor:
    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.redis_conn = None
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            #name=self.settings['LOGGER_NAME'],
            name="jay-redis-monitor",
            dir=self.settings['LOG_DIR'],
            #file=self.settings['LOG_FILE'],
            file="jay_redis_monitor.log",
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
            self.logger.debug("Successfully connected to Redis")
        except ConnectionError:
            self.logger.error("Failed to connect to Redis")
            # essential to functionality
            sys.exit(1)

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        self.logger.debug("Running main loop")
        print('Running main loop')
        jaystats = JayStatsMonitor()
        jaystats.setup(self.logger, self.redis_conn)
        jayinfo = JayInfoMonitor()
        jayinfo.setup(self.logger, self.redis_conn)

        while True:
            jaystats.handle()
            jayinfo.handle()

            time.sleep(1)
Exemple #19
0
class RedisMonitor:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.redis_conn = None
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json,
                                              stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
            self.logger.debug("Successfully connected to Redis")
        except ConnectionError:
            self.logger.error("Failed to connect to Redis")
            # essential to functionality
            sys.exit(1)

        self._load_plugins()
        self._setup_stats()

    def import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d+1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins and defaults
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}"
                              .format(cls=key))
            the_class = self.import_class(key)
            instance = the_class()
            instance.redis_conn = self.redis_conn
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_regex = instance.regex

            mini = {}
            mini['instance'] = instance
            if the_regex is None:
                raise ImportError()
                # continue
            mini['regex'] = the_regex

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(),
                                               key=lambda t: t[0]))

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        self.logger.debug("Running main loop")
        print 'Running main loop'
        old_time = 0
        while True:
            for plugin_key in self.plugins_dict:
                obj = self.plugins_dict[plugin_key]
                self._process_plugin(obj)

            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()

                    if self.settings['STATS_DUMP_CRAWL']:
                        self._dump_crawl_stats()

                    if self.settings['STATS_DUMP_QUEUE']:
                        self._dump_queue_stats()

                    old_time = new_time

            time.sleep(0.1)

    def _process_plugin(self, plugin):
        '''
        Logic to handle each plugin that is active

        @param plugin: a plugin dict object
        '''
        instance = plugin['instance']
        regex = plugin['regex']
        for key in self.redis_conn.scan_iter(match=regex):
            val = self.redis_conn.get(key)
            try:
                self._process_key_val(instance, key, val)
            except Exception:
                self.logger.error(traceback.format_exc())
                self._increment_fail_stat('{k}:{v}'.format(k=key, v=val))
                self.redis_conn.delete(key)

    def _process_key_val(self, instance, key, val):
        '''
        Logic to let the plugin instance process the redis key/val
        Split out for unit testing

        @param instance: the plugin instance
        @param key: the redis key
        @param val: the key value from redis
        '''
        if instance.check_precondition(key, val):
            combined = '{k}:{v}'.format(k=key, v=val)
            self._increment_total_stat(combined)
            self._increment_plugin_stat(
                instance.__class__.__name__,
                combined)
            instance.handle(key, val)
            self.redis_conn.delete(key)

    def _setup_stats(self):
        '''
        Sets up the stats
        '''
        # stats setup
        self.stats_dict = {}

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total()

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins()

    def _setup_stats_total(self):
        '''
        Sets up the total stats collectors
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:redis-monitor:total'
        temp_key2 = 'stats:redis-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=self.redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])

                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=self.redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])

                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn,
                        key='{k}:lifetime'.format(k=temp_key1),
                        cycle_time=self.settings['STATS_CYCLE'],
                        roll=False)
        total2 = StatsCollector.get_hll_counter(redis_conn=self.redis_conn,
                        key='{k}:lifetime'.format(k=temp_key2),
                        cycle_time=self.settings['STATS_CYCLE'],
                        roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self):
        '''
        Sets up the plugin stats collectors
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:redis-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=self.redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError as e:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(redis_conn=self.redis_conn,
                            key='{k}:lifetime'.format(k=temp_key),
                            cycle_time=self.settings['STATS_CYCLE'],
                            roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _increment_total_stat(self, item):
        '''
        Increments the total stat counters

        @param item: the unique string used for the HLL counter
        '''
        item = item + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(item)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the unique string used for the HLL counter
        '''
        item = item + str(time.time())
        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the unique string used for the HLL counter
        '''
        item = item + str(time.time())
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Redis Monitor Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Redis Monitor Stats Dump', extra=extras)

    def _dump_crawl_stats(self):
        '''
        Dumps flattened crawling stats so the spiders do not have to
        '''
        extras = {}
        spiders = {}

        spider_set = set()
        total_spider_count = 0

        keys = self.redis_conn.keys('stats:crawler:*:*:*')
        for key in keys:
            # we only care about the spider
            elements = key.split(":")
            spider = elements[3]

            if spider not in spiders:
                spiders[spider] = 0

            if len(elements) == 6:
                # got a time based stat
                response = elements[4]
                end = elements[5]

                final = '{s}_{r}_{e}'.format(s=spider, r=response, e=end)

                if end == 'lifetime':
                    value = self.redis_conn.execute_command("PFCOUNT", key)
                else:
                    value = self.redis_conn.zcard(key)

                extras[final] = value

            elif len(elements) == 5:
                # got a spider identifier
                spiders[spider] += 1
                total_spider_count += 1
                spider_set.add(spider)

            else:
                self.logger.warn("Unknown crawler stat key", {"key":key})

        # simple counts
        extras['unique_spider_count'] = len(spider_set)
        extras['total_spider_count'] = total_spider_count

        for spider in spiders:
            extras['{k}_spider_count'.format(k=spider)] = spiders[spider]

        if not self.logger.json:
            self.logger.info('Crawler Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Crawler Stats Dump', extra=extras)

    def _dump_queue_stats(self):
        '''
        Dumps basic info about the queue lengths for the spider types
        '''
        extras = {}
        keys = self.redis_conn.keys('*:*:queue')
        total_backlog = 0
        for key in keys:
            elements = key.split(":")
            spider = elements[0]
            domain = elements[1]
            spider = 'queue_' + spider

            if spider not in extras:
                extras[spider] = {}
                extras[spider]['spider_backlog'] = 0
                extras[spider]['num_domains'] = 0

            count = self.redis_conn.zcard(key)
            total_backlog += count
            extras[spider]['spider_backlog'] += count
            extras[spider]['num_domains'] += 1

        extras['total_backlog'] = total_backlog

        if not self.logger.json:
            self.logger.info('Queue Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Queue Stats Dump', extra=extras)
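
The plugin loader above only relies on a handful of attributes: a class path registered under PLUGINS in the settings, a regex of Redis keys to claim, plus _set_logger(), setup(), check_precondition() and handle(). A minimal stand-alone sketch of such a plugin (the class name, key pattern and settings entry below are made up for illustration):

import time


class DummyExpireMonitor(object):
    # keys this plugin claims; scanned by _process_plugin()
    regex = "dummy:*:expire"

    def _set_logger(self, logger):
        self.logger = logger

    def setup(self, settings):
        # self.redis_conn is assigned by RedisMonitor._load_plugins()
        self.settings = settings

    def check_precondition(self, key, value):
        # only act once the stored timestamp has passed
        return float(value) <= time.time()

    def handle(self, key, value):
        # the key is deleted by _process_key_val() after this returns
        self.logger.info("Dummy expire plugin fired", extra={"key": key})


# registered in the settings file with a numeric value used for ordering, e.g.
# PLUGINS = {'plugins.dummy_expire_monitor.DummyExpireMonitor': 100}
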
Exemple #20
0
class Dispatcher:
    def __init__(self, tasks, redis_conn):

        self.tasks = tasks  # initial URL seed queue
        self.redis_conn = redis_conn
        self.wrapper = SettingsWrapper()

        self.spiders = []  # currently running spider nodes
        self.spiders_weights = None  # weights of the current spider nodes
        self.settings = None
        self.logger = None

    def setup(self):
        """从配置文件中加载配置信息"""
        self.settings = self.wrapper.load('settings.py')
        self.logger = LogFactory.get_instance(
            json=self.settings['LOG_JSON'],
            stdout=self.settings['LOG_STDOUT'],
            level=self.settings['LOG_LEVEL'],
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

    def schedule_seeds(self):
        """
        分配初始种子队列
        :return:
        """
        # 加载算法
        robin = Robin()

        count = 0  # counter: flush the pipeline in batches every 100 items
        pip = self.redis_conn.pipeline()

        while True:
            task_json = self.redis_conn.lpop('seeds')
            if not task_json:
                time.sleep(3)  # wait 3 seconds before polling Redis again
                continue

            self.logger.debug('Distributing initial seed URLs...')
            task = pickle.loads(task_json)
            url = task['url']
            spider_type = task['spider_type']
            domain = tldextract.extract(url).domain

            # update the maximum spider node index used by the weighted scheduling algorithm
            spiders_weights = copy.deepcopy(self.spiders_weights)
            max_job = max(spiders_weights.keys())
            if max_job != robin.get_max_job():
                robin.update_max_job(max_job)

            job_id = robin.choose_spider(spiders_weights)  # pick a spider node via weighted round robin

            if job_id == -1:  # a spider node has failed
                pass
            queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                spider_type=spider_type, job_id=job_id, domain=domain)
            priority = task['priority']
            pip.zadd(queue_key, pickle.dumps(task), priority)
            count += 1
            if count == 100:
                pip.execute()
                count = 0

    def get_spiders_weights(self, interval):
        """
        Periodically fetch the weight of each spider node
        :return:
        """
        spiders_weights = dict()
        for key in self.redis_conn.keys('weight:spider:*:*'):
            job_id = int(key.split(':')[3])
            spiders_weights[job_id] = int(self.redis_conn.get(key))
        self.spiders_weights = spiders_weights
        t = Timer(interval, self.get_spiders_weights, (interval, ))
        t.start()

    def schedule_tasks(self):
        """主节点任务调度"""

        # load the scheduling algorithm
        robin = Robin()

        count = 0  # counter: flush the pipeline in batches every 500 items
        pip = self.redis_conn.pipeline()
        while True:
            task_json = self.redis_conn.lpop('tasks')
            if not task_json:
                time.sleep(3)  # wait before polling Redis again
                continue

            self.logger.debug('Distributing newly added URLs to the spider nodes...')
            task = pickle.loads(task_json)
            url = task['url']
            spider_type = task['spider_type']
            domain = tldextract.extract(url).domain

            # update the maximum spider node index used by the weighted scheduling algorithm
            spiders_weights = copy.deepcopy(self.spiders_weights)
            max_job = max(spiders_weights.keys())
            if max_job != robin.get_max_job():
                robin.update_max_job(max_job)

            job_id = robin.choose_spider(spiders_weights)  # pick a spider node via weighted round robin
            queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                spider_type=spider_type, job_id=job_id, domain=domain)
            priority = task['priority']
            pip.zadd(queue_key, pickle.dumps(task), priority)
            count += 1
            if count == 500:
                pip.execute()
                count = 0

    def run(self):
        self.get_spiders_weights(3)

        # distribute the initial seed queue
        schedule_seeds_thread = Thread(target=self.schedule_seeds)
        schedule_seeds_thread.start()
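
A hedged sketch of handing the Dispatcher above an initial seed; the Redis location and the 'link' spider type are assumptions, while the task fields ('url', 'spider_type', 'priority') are exactly what schedule_seeds() reads back:

import pickle

import redis

redis_conn = redis.Redis(host='localhost', port=6379)

task = {
    'url': 'http://istresearch.com',
    'spider_type': 'link',   # assumed spider type name
    'priority': 100,
}
# schedule_seeds() lpops from the 'seeds' list and unpickles each task
redis_conn.rpush('seeds', pickle.dumps(task))
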
Exemple #21
0
 def setUp(self):
     self.wrapper = SettingsWrapper()
Exemple #22
0
class TestSettingsWrapper(TestCase):

    defaults = {"STRING": "stuff", "DICT": {"value": "other stuff"}}

    def setUp(self):
        self.wrapper = SettingsWrapper()

    def test_no_defaults(self):
        self.wrapper._load_defaults()
        sets = self.wrapper.settings()
        self.assertEqual(sets, {})

    def test_load_default(self):
        self.wrapper._load_defaults("test_default_settings.py")
        sets = self.wrapper.settings()
        self.assertEqual(sets, self.defaults)

    def test_no_override(self):
        # test no prior defaults
        self.wrapper.my_settings = {}
        self.wrapper._load_custom()
        sets = self.wrapper.settings()
        self.assertEqual(sets, {})

        self.wrapper._load_defaults("test_default_settings.py")
        self.wrapper._load_custom()
        sets = self.wrapper.settings()
        self.assertEqual(sets, self.defaults)

    def test_override_default(self):
        self.wrapper._load_defaults("test_default_settings.py")
        self.wrapper._load_custom("test_override_defaults.py")
        sets = self.wrapper.settings()
        actual = {
            'NEW_DICT': {
                'other': 'stuff'
            },
            'MY_STRING': 'cool',
            'DICT': {
                'append': 'value',
                'value': 'override'
            },
            'STRING': 'my stuff',
            'NEW_LIST': ['item1']
        }
        self.assertEqual(sets, actual)

    def test_load_string(self):
        s = """STRING = \"my stuff\"\nMY_STRING = \"cool\"\nNEW_LIST = [\'item2\']"""

        real = {
            'STRING': 'my stuff',
            'MY_STRING': 'cool',
            'NEW_LIST': ['item2']
        }

        sets = self.wrapper.load_from_string(s)
        self.assertItemsEqual(real, sets)
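
One possible pair of fixture files consistent with the assertions above; the exact contents are inferred from the expected dictionaries, so treat them as a sketch:

# test_default_settings.py
STRING = "stuff"
DICT = {"value": "other stuff"}

# test_override_defaults.py
STRING = "my stuff"
MY_STRING = "cool"
NEW_LIST = ['item1']
NEW_DICT = {'other': 'stuff'}
DICT = {'value': 'override', 'append': 'value'}
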
Exemple #23
0
class Feed:
    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )

    def feed(self, json_item):
        instance = ScraperHandler()
        instance._set_logger(self.logger)
        instance.setup(self.settings)
        the_schema = None
        with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
            the_schema = json.load(the_file)

        the_dict = json_item
        ret = True
        try:
            self.validator(the_schema).validate(the_dict)
            instance.handle(the_dict)
            self.logger.info("Successfully fed item to Kafka")
        except ValidationError:
            self.logger.error("Failed to feed item into Kafka")
Exemple #24
0
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.', add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store', required=False,
                        help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store', required=False,
                        help="The settings file to read from",
                        default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store', required=False,
                        help="The log level", default=None,
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default='default',
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                             " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    logger.debug("Connecting to {0}...".format(kafka_host))
    try:
        kafka = KafkaClient(kafka_host)
        logger.info("Connected to {0}".format(kafka_host))
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occured. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        logger.error(message)
        sys.exit(1)

    if args['command'] == 'list':
        logger.debug('Running list command')
        print "Topics:"
        for topic in kafka.topic_partitions.keys():
            print "-", topic
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        @MethodTimer.timeout(5, None)
        def _hidden():
            try:
                logger.debug("Ensuring topic {t} exists".format(t=topic))
                kafka.ensure_topic_exists(topic)

                logger.debug("Getting Kafka consumer")
                consumer = SimpleConsumer(kafka, consumer_id, topic,
                                      buffer_size=1024*100,
                                      fetch_size_bytes=1024*100,
                                      max_buffer_size=None
                                      )
                return consumer
            except KafkaUnavailableError as ex:
                    message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                        .format(type(ex).__name__, ex.args)
                    logger.error(message)
                    sys.exit(1)

        consumer = _hidden()

        if consumer is None:
            logger.error("Could not fully connect to Kafka within the timeout")
            sys.exit(1)

        if args["from_beginning"]:
            logger.debug("Seeking to beginning")
            consumer.seek(0, 0)
        else:
            logger.debug("Reading from the end")
            consumer.seek(0, 2)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer.get_messages():
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])

                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print json.dumps(item, indent=4)
                    else:
                        print item
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except:
                logger.error(traceback.format_exc())
                break

        total_mbs = float(total_bytes) / (1024*1024)
        if item is not None:
            print "Last item:"
            print json.dumps(item, indent=4)
        if num_records > 0:
            logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                    .format(n=num_records, m=total_mbs,
                            kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        kafka.close()
        return 0
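
Two hedged invocations of the utility above, assuming the script is saved as kafkadump.py; the topic name comes from the demo.incoming topic referenced elsewhere in these examples, and the flags come straight from the argparse definitions:

python kafkadump.py list -s localsettings.py
python kafkadump.py dump -t demo.incoming -p -nb
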
Exemple #25
0
import argparse
from scutils.settings_wrapper import SettingsWrapper

# set up arg parser
parser = argparse.ArgumentParser(
    description='Example SettingsWrapper parser.\n')
parser.add_argument('-s', '--settings', action='store', required=False,
                    help="The default settings file",
                    default="settings.py")
parser.add_argument('-o', '--override-settings', action='store', required=False,
                    help="The override settings file",
                    default="localsettings.py")
parser.add_argument('-v', '--variable', action='store', required=False,
                    help="The variable to print out",
                    default=None)
args = vars(parser.parse_args())

# load up settings
wrapper = SettingsWrapper()
my_settings = wrapper.load(default=args['settings'],
                           local=args['override_settings'])

if args['variable'] is not None:
    if args['variable'] in my_settings:
        print args['variable'], '=', my_settings[args['variable']]
    else:
        print args['variable'], "not in loaded settings"
else:
    print "Full settings:", my_settings
Exemple #26
0
class KafkaMonitor(object):

    consumer = None

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test
        self.my_uuid = str(uuid.uuid4()).split('-')[4]

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d + 1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()  # an instance of the plugin class
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None
            with open(self.settings['PLUGIN_DIR'] +
                      instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}  # each plugin's instance and its corresponding schema (the request specification)
            mini['instance'] = instance
            mini['schema'] = the_schema
            self.logger.debug(
                "Successfully loaded plugin {cls}".format(cls=key))
            self.plugins_dict[
                plugins[key]] = mini  # {plugin name: {'instance': ..., 'schema': ...}, ...}

        self.plugins_dict = OrderedDict(
            sorted(list(self.plugins_dict.items()), key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'],
                                 db=self.settings.get('REDIS_DB'))

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
            self.redis_conn = redis_conn
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key1),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        total2 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key2),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=temp_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        self.consumer = self._create_consumer()
        self.logger.debug("Successfully connected to Kafka")

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in list(properties.items()):
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(
                    old_div(time.time(), self.settings['STATS_DUMP']))
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            self._report_self()
            time.sleep(self.settings['SLEEP_TIME'])

    # The producer has already pushed json_item = {u'uuid': u'abc123', u'appid': u'stuff'} into the demo.incoming topic.
    # Here the consumer pulls messages from that topic and matches each message against the plugin schemas,
    # letting the matching plugin handle it (the scraper plugin stores crawl messages in Redis).
    def _process_messages(self):
        try:
            # self.consumer = <kafka.consumer.group.KafkaConsumer object at 0x2648f10>
            for message in self.consumer:
                # message: ConsumerRecord(
                #     topic=u'demo.incoming', partition=0, offset=13,
                #     timestamp=1515581029167, timestamp_type=0, key=None,
                #     value='{"url": "istresearch.com", "crawlid": "abc123", "appid": "madisonTest"}',
                #     checksum=-342948442, serialized_key_size=-1, serialized_value_size=71
                #     )

                #  ConsumerRecord(
                # topic=u'demo.incoming', partition=0, offset=14,
                # timestamp=1515581538398, timestamp_type=0, key=None,
                # value='{"uuid": "abc123", "appid": "stuff"}',
                # checksum=-439167079, serialized_key_size=-1, serialized_value_size=36)

                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.value)
                    loaded_dict = json.loads(
                        message.value)  # {"uuid": "abc123", "appid": "stuff"}
                    found_plugin = False
                    for key in self.plugins_dict:  # {plugin name: {'instance': ..., 'schema': ...}, ...}
                        # to prevent reference modification
                        the_dict = copy.deepcopy(loaded_dict)
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)  # 匹配插件
                            found_plugin = True
                            self._increment_plugin_stat(
                                instance.__class__.__name__, the_dict)
                            # the_dict: {u'allowed_domains': None, u'allow_regex': None,
                            # u'crawlid': u'abc123',
                            # u'url': u'istresearch.com', u'expires': 0,
                            # 'ts': 1515581029.185904, u'priority': 1,
                            # u'deny_regex': None, u'cookie': None,
                            # u'attrs': None, u'appid': u'madisonTest',
                            # u'spiderid': u'link', u'useragent': None, u'deny_extensions': None,
                            # u'maxdepth': 0}

                            #{u'stats': u'all', u'uuid': u'abc123', 'ts': 1515581538.40857, u'appid': u'stuff'}

                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn(
                            "Did not find schema to validate "
                            "request",
                            extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.value)
        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek_to_end()
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the loaded message object for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':
                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for the HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][
                        key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def _report_self(self):
        '''
        Reports the kafka monitor uuid to redis
        '''
        key = "stats:kafka-monitor:self:{m}:{u}".format(m=socket.gethostname(),
                                                        u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.settings['HEARTBEAT_TIMEOUT'])

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            producer = self._create_producer()
            topic = self.settings['KAFKA_INCOMING_TOPIC']
            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            if producer is not None:
                producer.send(topic, json_item)
                producer.flush()
                producer.close(timeout=10)
                return True
            else:
                return False

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_consumer(self):
        """Tries to establing the Kafka consumer connection"""
        try:
            brokers = self.settings['KAFKA_HOSTS']
            self.logger.debug("Creating new kafka consumer using brokers: " +
                              str(brokers) + ' and topic ' +
                              self.settings['KAFKA_INCOMING_TOPIC'])

            return KafkaConsumer(
                self.settings['KAFKA_INCOMING_TOPIC'],
                group_id=self.settings['KAFKA_GROUP'],
                bootstrap_servers=brokers,
                consumer_timeout_ms=self.settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=self.
                settings['KAFKA_CONSUMER_AUTO_OFFSET_RESET'],
                auto_commit_interval_ms=self.
                settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=self.
                settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=self.
                settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except KeyError as e:
            self.logger.error('Missing setting named ' + str(e),
                              {'ex': traceback.format_exc()})
        except:
            self.logger.error(
                "Couldn't initialize kafka consumer for topic", {
                    'ex': traceback.format_exc(),
                    'topic': self.settings['KAFKA_INCOMING_TOPIC']
                })
            raise

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_producer(self):
        """Tries to establish a Kafka consumer connection"""
        try:
            brokers = self.settings['KAFKA_HOSTS']
            self.logger.debug("Creating new kafka producer using brokers: " +
                              str(brokers))

            return KafkaProducer(
                bootstrap_servers=brokers,
                value_serializer=lambda m: json.dumps(m),
                retries=3,
                linger_ms=self.settings['KAFKA_PRODUCER_BATCH_LINGER_MS'],
                buffer_memory=self.settings['KAFKA_PRODUCER_BUFFER_BYTES'])
        except KeyError as e:
            self.logger.error('Missing setting named ' + str(e),
                              {'ex': traceback.format_exc()})
        except:
            self.logger.error("Couldn't initialize kafka producer.",
                              {'ex': traceback.format_exc()})
            raise

    def close(self):
        '''
        Call to properly tear down the Kafka Monitor
        '''
        if self.consumer is not None:
            self.consumer.close()
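
A hedged driver for the monitor above; the settings file name is an assumption, and the fed document matches the stats request format shown in the inline comments:

monitor = KafkaMonitor('localsettings.py')
monitor.setup(level='INFO')

# one-shot: push a stats request into the incoming Kafka topic
monitor.feed({'uuid': 'abc123', 'appid': 'stuff', 'stats': 'all'})

# or run the long-lived consumer loop instead
# monitor.run()
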
Exemple #27
0
class TestSettingsWrapper(TestCase):

    defaults = {"STRING": "stuff", "DICT": {"value": "other stuff"}}

    def setUp(self):
        self.wrapper = SettingsWrapper()

    def test_no_defaults(self):
        self.wrapper._load_defaults()
        sets = self.wrapper.settings()
        self.assertEqual(sets, {})

    def test_load_default(self):
        self.wrapper._load_defaults("test_default_settings.py")
        sets = self.wrapper.settings()
        self.assertEqual(sets, self.defaults)

    def test_no_override(self):
        # test no prior defaults
        self.wrapper.my_settings = {}
        self.wrapper._load_custom()
        sets = self.wrapper.settings()
        self.assertEqual(sets, {})

        self.wrapper._load_defaults("test_default_settings.py")
        self.wrapper._load_custom()
        sets = self.wrapper.settings()
        self.assertEqual(sets, self.defaults)

    def test_override_default(self):
        self.wrapper._load_defaults("test_default_settings.py")
        self.wrapper._load_custom("test_override_defaults.py")
        sets = self.wrapper.settings()
        actual = {
            'NEW_DICT': {
                'other': 'stuff'
            },
            'MY_STRING': 'cool',
            'DICT': {
                'append': 'value',
                'value': 'override'
            },
            'STRING': 'my stuff',
            'NEW_LIST': ['item1']
        }
        self.assertEqual(sets, actual)

    def test_load_string(self):
        s = """STRING = \"my stuff\"\nMY_STRING = \"cool\"\nNEW_LIST = [\'item2\']"""

        real = {
            'STRING': 'my stuff',
            'MY_STRING': 'cool',
            'NEW_LIST': ['item2']
        }

        sets = self.wrapper.load_from_string(s)
        self.assertItemsEqual(real, sets)
Exemple #28
0
class KafkaMonitor:
    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d + 1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None

            print("self.settings['PLUGIN_DIR'] + instance.schema====",
                  self.settings['PLUGIN_DIR'] + instance.schema)
            with open(self.settings['PLUGIN_DIR'] +
                      instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(
            sorted(self.plugins_dict.items(), key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(
            json=my_json,
            stdout=my_output,
            level=my_level,
            name=self.settings['LOGGER_NAME'],
            dir=self.settings['LOG_DIR'],
            file=self.settings['LOG_FILE'],
            bytes=self.settings['LOG_MAX_BYTES'],
            backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key1),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        total2 = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=temp_key2),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=temp_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                    self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(
                    self.kafka_conn,
                    self.settings['KAFKA_GROUP'],
                    self.settings['KAFKA_INCOMING_TOPIC'],
                    auto_commit=True,
                    iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True

        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                    validator,
                    properties,
                    instance,
                    schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class,
            {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)
                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            v = self.validator(schema)
                            v.validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                instance.__class__.__name__, the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn(
                            "Did not find schema to validate "
                            "request",
                            extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the raw message string used for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':

                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][
                        key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
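To make the validation flow above concrete, here is a small sketch, not taken from the original project, of what extend_with_default() provides: a jsonschema Draft4Validator extended this way fills in missing keys from the schema's "default" values while validating an incoming request, so plugins receive fully populated dicts.

# Hedged illustration using the jsonschema package; the schema below is
# invented for this example and is not one of the monitor's plugin schemas.
from jsonschema import Draft4Validator, validators

def extend_with_default(validator_class):
    validate_properties = validator_class.VALIDATORS["properties"]

    def set_defaults(validator, properties, instance, schema):
        for error in validate_properties(validator, properties, instance, schema):
            yield error
        for prop, subschema in properties.items():
            if "default" in subschema:
                instance.setdefault(prop, subschema["default"])

    return validators.extend(validator_class, {"properties": set_defaults})

schema = {"properties": {"priority": {"type": "integer", "default": 5}}}
request = {}
extend_with_default(Draft4Validator)(schema).validate(request)
print(request)  # {'priority': 5} -- the default was injected during validation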
Exemple #29
0
 def setUp(self):
     self.wrapper = SettingsWrapper()
Exemple #30
0
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
        'debugging.',
        add_help=False)
    parser.add_argument('-h',
                        '--help',
                        action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh',
                             '--kafka-host',
                             action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s',
                             '--settings',
                             action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument(
        '-ll',
        '--log-level',
        action='store',
        required=False,
        help="The log level",
        default=None,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list',
                                        help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump',
                                        help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t',
                             '--topic',
                             action='store',
                             required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c',
                             '--consumer',
                             action='store',
                             required=False,
                             default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b',
                             '--from-beginning',
                             action='store_const',
                             required=False,
                             const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb',
                             '--no-body',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Do not include the raw html 'body' key in"
                             " the json dump of the topic")
    dump_parser.add_argument('-p',
                             '--pretty',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d',
                             '--decode-base64',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings[
        'KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings[
        'LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = KafkaClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)
        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        try:
            logger.debug("Getting Kafka consumer")

            offset = 'earliest' if args["from_beginning"] else 'latest'

            consumer = KafkaConsumer(  # consume messages from the topic (e.g. demo.crawled_firehose)
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings[
                    'KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings[
                    'KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings[
                    'KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable as ex:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])

                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                logger.error(traceback.format_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024 * 1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}".
                format(n=num_records,
                       m=total_mbs,
                       kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
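A hedged usage note for the dump utility above; the file name kafkadump.py is an assumption, since the snippet does not name the script.

# Illustrative invocations only; the topic name is also an assumption.
import subprocess

# List all topics on the Kafka host from localsettings.py (or the -kh override)
subprocess.call(['python', 'kafkadump.py', 'list'])

# Dump a topic from the beginning, pretty-printed, without the raw 'body' key
subprocess.call(['python', 'kafkadump.py', 'dump',
                 '-t', 'demo.crawled_firehose', '-p', '-nb', '-b'])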
Exemple #31
0
class Feed:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json, stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )


    def feed(self, json_item):
        instance = ScraperHandler()
        instance._set_logger(self.logger)
        instance.setup(self.settings)
        the_schema = None
        with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
            the_schema = json.load(the_file)

        the_dict = json_item
        ret = True
        try:
            self.validator(the_schema).validate(the_dict)
            instance.handle(the_dict)
            self.logger.info("Successfully fed item to Kafka")
        except ValidationError:
            self.logger.error("Failed to feed item into Kafka")
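A hedged driver for the Feed helper above; the settings file name and the request fields are assumptions based on the snippet, not taken from the original project.

feeder = Feed('localsettings.py')
feeder.setup(level='INFO', log_file=False, json=False)
# The dict must satisfy the ScraperHandler plugin's JSON schema; the keys
# below are typical Scrapy Cluster crawl-request fields (assumed here).
feeder.feed({
    'url': 'http://example.com',
    'appid': 'testapp',
    'crawlid': 'abc123',
})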
Exemple #32
0
class KafkaMonitor:

    def __init__(self, settings_name, unit_test=False):
        '''
        @param settings_name: the local settings file name
        @param unit_test: whether running unit tests or not
        '''
        self.settings_name = settings_name
        self.wrapper = SettingsWrapper()
        self.logger = None
        self.unit_test = unit_test

    def _import_class(self, cl):
        '''
        Imports a class from a string

        @param cl: the module and class name in dot notation
        '''
        d = cl.rfind(".")
        classname = cl[d+1:len(cl)]
        m = __import__(cl[0:d], globals(), locals(), [classname])
        return getattr(m, classname)

    def _load_plugins(self):
        '''
        Sets up all plugins, defaults and settings.py
        '''
        plugins = self.settings['PLUGINS']

        self.plugins_dict = {}
        for key in plugins:
            # skip loading the plugin if its value is None
            if plugins[key] is None:
                continue
            # valid plugin, import and setup
            self.logger.debug("Trying to load plugin {cls}".format(cls=key))
            the_class = self._import_class(key)
            instance = the_class()
            instance._set_logger(self.logger)
            if not self.unit_test:
                instance.setup(self.settings)
            the_schema = None
            with open(self.settings['PLUGIN_DIR'] + instance.schema) as the_file:
                the_schema = json.load(the_file)

            mini = {}
            mini['instance'] = instance
            mini['schema'] = the_schema

            self.plugins_dict[plugins[key]] = mini

        self.plugins_dict = OrderedDict(sorted(self.plugins_dict.items(),
                                               key=lambda t: t[0]))

    def setup(self, level=None, log_file=None, json=None):
        '''
        Load everything up. Note that any arg here will override both
        default and custom settings

        @param level: the log level
        @param log_file: boolean t/f whether to log to a file, else stdout
        @param json: boolean t/f whether to write the logs in json
        '''
        self.settings = self.wrapper.load(self.settings_name)

        my_level = level if level else self.settings['LOG_LEVEL']
        # negate because logger wants True for std out
        my_output = not log_file if log_file else self.settings['LOG_STDOUT']
        my_json = json if json else self.settings['LOG_JSON']
        self.logger = LogFactory.get_instance(json=my_json, stdout=my_output,
                                              level=my_level,
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

        self.validator = self.extend_with_default(Draft4Validator)

    def _setup_stats(self):
        '''
        Sets up the stats collection
        '''
        self.stats_dict = {}

        redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                 port=self.settings['REDIS_PORT'])

        try:
            redis_conn.info()
            self.logger.debug("Connected to Redis in StatsCollector Setup")
        except ConnectionError:
            self.logger.warn("Failed to connect to Redis in StatsCollector"
                             " Setup, no stats will be collected")
            return

        if self.settings['STATS_TOTAL']:
            self._setup_stats_total(redis_conn)

        if self.settings['STATS_PLUGINS']:
            self._setup_stats_plugins(redis_conn)

    def _setup_stats_total(self, redis_conn):
        '''
        Sets up the total stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        temp_key1 = 'stats:kafka-monitor:total'
        temp_key2 = 'stats:kafka-monitor:fail'
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)
                self.stats_dict['total'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key1, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.stats_dict['fail'][time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=redis_conn,
                                key='{k}:{t}'.format(k=temp_key2, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up total/fail Stats Collector '{i}'"\
                        .format(i=item))
            except AttributeError as e:
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total1 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key1),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        total2 = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                key='{k}:lifetime'.format(k=temp_key2),
                                                cycle_time=self.settings['STATS_CYCLE'],
                                                roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        self.stats_dict['total']['lifetime'] = total1
        self.stats_dict['fail']['lifetime'] = total2

    def _setup_stats_plugins(self, redis_conn):
        '''
        Sets up the plugin stats collectors

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for key in self.plugins_dict:
            plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
            temp_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
            self.stats_dict['plugins'][plugin_name] = {}
            for item in self.settings['STATS_TIMES']:
                try:
                    time = getattr(StatsCollector, item)

                    self.stats_dict['plugins'][plugin_name][time] = StatsCollector \
                            .get_rolling_time_window(
                                    redis_conn=redis_conn,
                                    key='{k}:{t}'.format(k=temp_key, t=time),
                                    window=time,
                                    cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug("Set up {p} plugin Stats Collector '{i}'"\
                            .format(p=plugin_name, i=item))
                except AttributeError:
                    self.logger.warning("Unable to find Stats Time '{s}'"\
                            .format(s=item))
            total = StatsCollector.get_hll_counter(redis_conn=redis_conn,
                                                   key='{k}:lifetime'.format(k=temp_key),
                                                   cycle_time=self.settings['STATS_CYCLE'],
                                                   roll=False)
            self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"\
                            .format(p=plugin_name))
            self.stats_dict['plugins'][plugin_name]['lifetime'] = total

    def _setup_kafka(self):
        '''
        Sets up kafka connections
        '''
        @MethodTimer.timeout(self.settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                self.kafka_conn.ensure_topic_exists(
                        self.settings['KAFKA_INCOMING_TOPIC'])
                self.consumer = SimpleConsumer(self.kafka_conn,
                                               self.settings['KAFKA_GROUP'],
                                               self.settings['KAFKA_INCOMING_TOPIC'],
                                               auto_commit=True,
                                               iter_timeout=1.0)
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                self.logger.error(message)
                sys.exit(1)
            return True
        ret_val = _hidden_setup()

        if ret_val:
            self.logger.debug("Successfully connected to Kafka")
        else:
            self.logger.error("Failed to set up Kafka Connection within"
                              " timeout")
            # this is essential to running the kafka monitor
            sys.exit(1)

    def extend_with_default(self, validator_class):
        '''
        Method to add default fields to our schema validation
        ( From the docs )
        '''
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for error in validate_properties(
                validator, properties, instance, schema,
            ):
                yield error

            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        self.logger.debug("Processing messages")
        old_time = 0
        while True:
            self._process_messages()
            if self.settings['STATS_DUMP'] != 0:
                new_time = int(time.time() / self.settings['STATS_DUMP'])
                # only log every X seconds
                if new_time != old_time:
                    self._dump_stats()
                    old_time = new_time

            time.sleep(.01)

    def _process_messages(self):
        try:
            for message in self.consumer.get_messages():
                if message is None:
                    self.logger.debug("no message")
                    break
                try:
                    self._increment_total_stat(message.message.value)
                    the_dict = json.loads(message.message.value)
                    found_plugin = False
                    for key in self.plugins_dict:
                        obj = self.plugins_dict[key]
                        instance = obj['instance']
                        schema = obj['schema']
                        try:
                            self.validator(schema).validate(the_dict)
                            found_plugin = True
                            self._increment_plugin_stat(
                                    instance.__class__.__name__,
                                    the_dict)
                            ret = instance.handle(the_dict)
                            # break if nothing is returned
                            if ret is None:
                                break
                        except ValidationError:
                            pass
                    if not found_plugin:
                        extras = {}
                        extras['parsed'] = True
                        extras['valid'] = False
                        extras['data'] = the_dict
                        self.logger.warn("Did not find schema to validate "
                                         "request", extra=extras)
                        self._increment_fail_stat(the_dict)

                except ValueError:
                    extras = {}
                    extras['parsed'] = False
                    extras['valid'] = False
                    extras['data'] = message.message.value
                    self.logger.warning('Unparseable JSON Received',
                                        extra=extras)
                    self._increment_fail_stat(message.message.value)

        except OffsetOutOfRangeError:
            # consumer has no idea where they are
            self.consumer.seek(0, 2)
            self.logger.error("Kafka offset out of range error")

    def _increment_total_stat(self, string):
        '''
        Increments the total stat counters

        @param string: the raw message string used for the counter
        '''
        string = string + str(time.time())
        if 'total' in self.stats_dict:
            self.logger.debug("Incremented total stats")
            for key in self.stats_dict['total']:
                if key == 'lifetime':

                    self.stats_dict['total'][key].increment(string)
                else:
                    self.stats_dict['total'][key].increment()

    def _increment_fail_stat(self, item):
        '''
        Increments the fail stat counters

        @param item: the loaded message object for HLL counter
        '''
        if isinstance(item, dict):
            item['ts'] = time.time()
        elif isinstance(item, str):
            item = item + str(time.time())

        if 'fail' in self.stats_dict:
            self.logger.debug("Incremented fail stats")
            for key in self.stats_dict['fail']:
                if key == 'lifetime':
                    self.stats_dict['fail'][key].increment(item)
                else:
                    self.stats_dict['fail'][key].increment()

    def _increment_plugin_stat(self, name, item):
        '''
        Increments the plugin stat counters

        @param name: The formal name of the plugin
        @param item: the loaded message object for HLL counter
        '''
        item['ts'] = time.time()
        if 'plugins' in self.stats_dict:
            self.logger.debug("Incremented plugin '{p}' plugin stats"\
                    .format(p=name))
            for key in self.stats_dict['plugins'][name]:
                if key == 'lifetime':
                    self.stats_dict['plugins'][name][key].increment(item)
                else:
                    self.stats_dict['plugins'][name][key].increment()

    def _dump_stats(self):
        '''
        Dumps the stats out
        '''
        extras = {}
        if 'total' in self.stats_dict:
            self.logger.debug("Compiling total/fail dump stats")
            for key in self.stats_dict['total']:
                final = 'total_{t}'.format(t=key)
                extras[final] = self.stats_dict['total'][key].value()
            for key in self.stats_dict['fail']:
                final = 'fail_{t}'.format(t=key)
                extras[final] = self.stats_dict['fail'][key].value()

        if 'plugins' in self.stats_dict:
            self.logger.debug("Compiling plugin dump stats")
            for name in self.stats_dict['plugins']:
                for key in self.stats_dict['plugins'][name]:
                    final = 'plugin_{n}_{t}'.format(n=name, t=key)
                    extras[final] = self.stats_dict['plugins'][name][key].value()

        if not self.logger.json:
            self.logger.info('Kafka Monitor Stats Dump:\n{0}'.format(
                    json.dumps(extras, indent=4, sort_keys=True)))
        else:
            self.logger.info('Kafka Monitor Stats Dump', extra=extras)

    def run(self):
        '''
        Set up and run
        '''
        self._setup_kafka()
        self._load_plugins()
        self._setup_stats()
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        @MethodTimer.timeout(self.settings['KAFKA_FEED_TIMEOUT'], False)
        def _feed(json_item):
            try:
                self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
                topic = self.settings['KAFKA_INCOMING_TOPIC']
                producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError:
                self.logger.error("Unable to connect to Kafka")
                return False

            if not self.logger.json:
                self.logger.info('Feeding JSON into {0}\n{1}'.format(
                    topic, json.dumps(json_item, indent=4)))
            else:
                self.logger.info('Feeding JSON into {0}\n'.format(topic),
                                 extra={'value': json_item})

            self.kafka_conn.ensure_topic_exists(topic)
            producer.send_messages(topic, json.dumps(json_item))

            return True

        result = _feed(json_item)

        if result:
            self.logger.info("Successfully fed item to Kafka")
        else:
            self.logger.error("Failed to feed item into Kafka")
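A hedged usage sketch for the monitor above; 'localsettings.py' and the request fields are assumptions.

monitor = KafkaMonitor('localsettings.py')
monitor.setup(level='INFO', log_file=False, json=False)

# Push a single crawl request onto the incoming Kafka topic...
monitor.feed({'url': 'http://example.com', 'appid': 'testapp', 'crawlid': 'abc123'})

# ...or start the blocking consume -> validate -> plugin-dispatch loop:
# monitor.run()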
Exemple #33
0
class Dispatcher:

    def __init__(self, tasks, server):

        self.tasks = tasks  # initial URL seed queue
        self.server = server
        self.wrapper = SettingsWrapper()

        self.spiders = []  # spider nodes currently running
        self.spider_count = 0  # number of running spider nodes
        self.chose = None  # consistent-hash ring over spider job ids
        self.settings = None
        self.logger = None

    def setup(self):
        """Load configuration from the settings file."""
        self.settings = self.wrapper.load('settings.py')
        self.logger = LogFactory.get_instance(json=self.settings['LOG_JSON'],
                                              stdout=self.settings['LOG_STDOUT'],
                                              level=self.settings['LOG_LEVEL'],
                                              name=self.settings['LOGGER_NAME'],
                                              dir=self.settings['LOG_DIR'],
                                              file=self.settings['LOG_FILE'],
                                              bytes=self.settings['LOG_MAX_BYTES'],
                                              backups=self.settings['LOG_BACKUPS'])

    def initial_seeds(self):
        """Initialize the dispatcher with the initial seed URLs."""

        while True:
            initial_len = self.server.llen('seeds')
            if initial_len:
                break
            time.sleep(180)
            continue

        self.logger.debug('Fetching the initial seed list...')
        while True:
            tasks = self.server.lrange('seeds', 0, -1)
            self.server.ltrim('seeds', -1, 0)
            self.tasks.extend(tasks)
            if self.tasks:
                break

        self.logger.debug('Counting the currently running spider processes...')
        self.spiders = self.server.keys('stats:spider:*:*')  # list of spider stat keys
        self.spider_count = len(self.spiders)

        if self.spider_count:
            self.logger.debug('Placing spider nodes with the consistent-hash algorithm...')
            job_ids = []
            for spider in self.spiders:
                job_ids.append(spider.split(':')[3])
            self.chose = ketama.Continuum(job_ids)

            self.logger.debug('Distributing the initial seed URL queue...')
            for task_json in self.tasks:
                task = pickle.loads(task_json)
                if 'url' in task and 'spider_type' in task:
                    extract = tldextract.TLDExtract()
                    url = task['url']
                    spider_type = task['spider_type']
                    domain = extract(url).domain
                    job_id = self.chose[url.encode('utf-8')]
                    queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(spider_type=spider_type,
                                                                               job_id=job_id,
                                                                               domain=domain)
                    priority = task['priority']
                    self.server.zadd(queue_key, pickle.dumps(task), priority)
                else:
                    self.logger.error("please provide the url and spider_type you want to crawl")

    def spider_state_watcher(self):
        """Detect whether the set of spider nodes has changed."""
        self.spiders = self.server.keys('stats:spider:*:*')
        spider_count_now = len(self.spiders)
        if spider_count_now != self.spider_count:
            self.spider_count = spider_count_now
            return True

    def center_node_dispather(self):
        """Task scheduling loop on the central node."""
        while True:
            self.logger.debug('Fetching newly added URLs...')
            tasks = []
            if self.server.llen('seeds'):
                tasks.append(self.server.lpop('seeds'))
            self.tasks.extend(tasks)

            state = self.spider_state_watcher()
            if state:
                self.logger.debug('Iterating over spider nodes and pausing each running spider...')
                spider_ids = []
                spider_ip_ids = []
                for spider_key in self.spiders:
                    spider_ids.append(spider_key.split(':')[3])
                    spider_ip_ids.append((spider_key.split(':')[2], spider_key.split(':')[3]))
                for spider_ip_id in spider_ip_ids:
                    key = '{job}:status'.format(job=spider_ip_id[1])
                    self.server.set(key, 'pause')

                time.sleep(4)

                self.logger.debug('Spider node set changed; rebuilding the hash ring...')
                self.chose = ketama.Continuum(spider_ids)

                self.logger.debug('Reassigning per-site crawl tasks among spider nodes; do not start additional spiders during this window...')
                queue_keys = self.server.keys('*:queue')
                for queue_key in queue_keys:
                    tasks.extend(self.server.zrange(queue_key, 0, -1))  # collect all the URLs from the spider queues
                    self.server.zremrangebyrank(queue_key, 0, -1)  # empty the spider queue

                self.logger.debug('Resuming the previously paused spider nodes...')
                for spider_ip_id in spider_ip_ids:
                    key = '{job}:status'.format(job=spider_ip_id[1])
                    self.server.set(key, 'running')

            self.logger.debug('Please wait; redistributing URLs...')
            for task_json in tasks:
                task = pickle.loads(task_json)
                if 'url' in task and 'spider_type' in task:
                    extract = tldextract.TLDExtract()
                    url = task['url']
                    spider_type = task['spider_type']
                    domain = extract(url).domain
                    job_id = self.chose[url.encode('utf-8')]
                    queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(spider_type=spider_type,
                                                                               job_id=job_id,
                                                                               domain=domain)
                    priority = task['priority']
                    self.server.zadd(queue_key, pickle.dumps(task), priority)
                else:
                    self.logger.error("please provide the url and spider_type you want to crawl")

    def run(self):
        """Start the dispatcher."""
        self.initial_seeds()
        self.center_node_dispather()
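A hedged driver sketch, not from the original source, showing how the Dispatcher above could be wired to Redis; the seed format mirrors what initial_seeds() unpickles.

import pickle
import redis

server = redis.Redis(host='localhost', port=6379)

# Seeds are pickled task dicts pushed onto the 'seeds' list.
task = {'url': 'http://example.com/page', 'spider_type': 'link', 'priority': 50}
server.rpush('seeds', pickle.dumps(task))

dispatcher = Dispatcher(tasks=[], server=server)
dispatcher.setup()  # loads settings.py and builds the logger
dispatcher.run()    # blocks: assigns seeds, then rebalances as spiders join or leave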