Beispiel #1
0
class Cache(BaseThreadedModule):
    """
    A simple wrapper around the python simplekv module.

    It can be used to store results of modules in all simplekv supported backends.

    When set, the following options cause RedisStore to use a buffer for setting values.
    Multiple values are set via the pipe command, which speeds up storage. Still this comes at a price.
    Buffered values, that have not yet been send to redis, will be lost when LumberMill crashes.

    backend: backends supported by [simplekv](http://pythonhosted.org//simplekv/)
    store_interval_in_secs: Sending data to redis in x seconds intervals.
    batch_size: Sending data to redis if count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of values waiting for transmission. Values above count will be dropped.

    Configuration template:

    - Cache:
       backend:                         # <default: 'DictStore'; type: string; values:['DictStore', 'RedisStore', 'MemcacheStore']; is: optional>
       server:                          # <default: None; type: None||string; is: required if backend in ['RedisStore', 'MemcacheStore'] and cluster is None else optional>
       cluster:                         # <default: None; type: None||dictionary; is: required if backend == 'RedisStore' and server is None else optional>
       port:                            # <default: 6379; type: integer; is: optional>
       db:                              # <default: 0; type: integer; is: optional>
       password:                        # <default: None; type: None||string; is: optional>
       socket_timeout:                  # <default: 10; type: integer; is: optional>
       charset:                         # <default: 'utf-8'; type: string; is: optional>
       errors:                          # <default: 'strict'; type: string; is: optional>
       decode_responses:                # <default: False; type: boolean; is: optional>
       unix_socket_path:                # <default: None; type: None||string; is: optional>
       batch_size:                      # <default: None; type: None||integer; is: optional>
       store_interval_in_secs:          # <default: None; type: None||integer; is: optional>
       backlog_size:                    # <default: 5000; type: integer; is: optional>
    """
    module_type = "stand_alone"
    """Set module type"""

    def configure(self, configuration):
        """
        Create the simplekv store for the configured backend.

        If store_interval_in_secs or batch_size is configured, set/get/delete/pop
        are rebound on the instance to their buffered variants; the unbuffered
        originals stay reachable as _set/_get/_delete/_pop.
        """
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.backend = self.getConfigurationValue('backend')
        self.backend_client = None
        self.kv_store = None
        self.set_buffer = None
        if self.backend == 'DictStore':
            import simplekv.memory
            self.kv_store = simplekv.memory.DictStore()
        elif self.backend == 'RedisStore':
            import simplekv.memory.redisstore
            self.backend_client = self._getRedisClient()
            self.kv_store = simplekv.memory.redisstore.RedisStore(self.backend_client)
        elif self.backend == 'MemcacheStore':
            import simplekv.memory.memcachestore
            self.backend_client = self._getMemcacheClient()
            self.kv_store = simplekv.memory.memcachestore.MemcacheStore(self.backend_client)
        else:
            self.logger.error("Unknown backend type %s. Please check." % self.backend)
            self.lumbermill.shutDown()
            # Without a store there is nothing left to set up.
            return

        store_interval = self.getConfigurationValue('store_interval_in_secs')
        batch_size = self.getConfigurationValue('batch_size')
        if not (store_interval or batch_size):
            return
        # Redis gets a pipelined flush callback, all other backends are written one by one.
        if self.backend == 'RedisStore':
            flush_callback = self._setRedisBufferedCallback
        else:
            flush_callback = self._setBufferedCallback
        self.set_buffer = Buffer(batch_size, flush_callback, store_interval,
                                 maxsize=self.getConfigurationValue('backlog_size'))
        # Keep the direct accessors and expose the buffer aware ones under the public names.
        self._set, self.set = self.set, self._setBuffered
        self._get, self.get = self.get, self._getBuffered
        self._delete, self.delete = self.delete, self._deleteBuffered
        self._pop, self.pop = self.pop, self._popBuffered

    def _getRedisClient(self):
        """
        Return a redis client (cluster client if a cluster is configured) after a ping check.

        Logs an error and requests a LumberMill shutdown if the server does not answer.
        Returns None if no client could be created.
        """
        cluster_config = self.getConfigurationValue('cluster')
        if cluster_config:
            redis_store = cluster_config
            client = self._getClusterRedisClient()
        else:
            redis_store = self.getConfigurationValue('server')
            client = self._getSimpleRedisClient()
        if client is None:
            # Creation already failed and was logged by the factory method.
            return None
        try:
            client.ping()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (redis_store, etype, evalue))
            self.lumbermill.shutDown()
        return client

    def _getMemcacheClient(self):
        """Return the memcache client. Not implemented yet, always returns None."""
        client = None
        # TODO: implement memcache client
        return client

    def _getSimpleRedisClient(self):
        """Return a redis.StrictRedis client built from the module configuration, or None on error."""
        try:
            return redis.StrictRedis(host=self.getConfigurationValue('server'),
                                     port=self.getConfigurationValue('port'),
                                     db=self.getConfigurationValue('db'),
                                     password=self.getConfigurationValue('password'),
                                     socket_timeout=self.getConfigurationValue('socket_timeout'),
                                     charset=self.getConfigurationValue('charset'),
                                     errors=self.getConfigurationValue('errors'),
                                     decode_responses=self.getConfigurationValue('decode_responses'),
                                     unix_socket_path=self.getConfigurationValue('unix_socket_path'))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('server'), etype, evalue))
            self.lumbermill.shutDown()
        return None

    def _getClusterRedisClient(self):
        """
        Return a rediscluster.StrictRedisCluster client, or None on error.

        The 'cluster' configuration maps each master address to one slave
        address or a list of slave addresses. Every node gets a unique
        "node_<n>" key in the generated cluster description.
        """
        try:
            import rediscluster
        except ImportError:
            self.logger.error("Could not import rediscluster module. To install follow instructions @https://github.com/salimane/rediscluster-py")
            self.lumbermill.shutDown()
            return None
        # TODO: Implement a locking mechanism for the cluster client.
        # Some modules like Facet depend on this.
        cluster = {'nodes': {}, 'master_of': {}}
        node_count = 0
        for master_node, slave_nodes in self.getConfigurationValue('cluster').items():
            # Every node, master or slave, consumes its own number so keys never collide.
            node_count += 1
            master_node_key = "node_%d" % node_count
            node_name_or_ip, node_port = self._parseRedisServerAddress(master_node)
            cluster['nodes'][master_node_key] = {'host': node_name_or_ip, 'port': node_port}
            if 'default_node' not in cluster:
                # rediscluster expects the name of a node here, not its address.
                cluster['default_node'] = master_node_key
            if not slave_nodes:
                slave_nodes = []
            elif isinstance(slave_nodes, str):
                slave_nodes = [slave_nodes]
            for slave_node in slave_nodes:
                node_count += 1
                slave_node_key = "node_%d" % node_count
                node_name_or_ip, node_port = self._parseRedisServerAddress(slave_node)
                cluster['nodes'][slave_node_key] = {'host': node_name_or_ip, 'port': node_port}
                cluster['master_of'][master_node_key] = slave_node_key
        try:
            return rediscluster.StrictRedisCluster(cluster=cluster, db=self.getConfigurationValue('db'))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('cluster'), etype, evalue))
            self.lumbermill.shutDown()
        return None

    def _parseRedisServerAddress(self, node_address):
        """
        Split "host:port" into a (host, port) tuple.

        Addresses that do not consist of exactly two colon separated parts are
        treated as a bare host and combined with the configured default port.
        """
        try:
            node_name_or_ip, node_port = node_address.split(":")
        except ValueError:
            node_name_or_ip = node_address
            node_port = self.getConfigurationValue('port')
        return (node_name_or_ip, node_port)

    def getBackendName(self):
        """Return the name of the configured backend."""
        return self.backend

    def iterKeys(self):
        """Yield all keys known to the underlying store."""
        for key in self.kv_store.iter_keys():
            yield key

    def getClient(self):
        """Return the raw backend client (None for backends without a client)."""
        return self.backend_client

    def getLock(self, name, timeout=None, sleep=0.1):
        """Return a lock from the backend client, or False if the client offers no lock()."""
        try:
            return self.backend_client.lock(name, timeout, sleep)
        except AttributeError:
            return False

    def set(self, key, value, ttl=0, pickle=True):
        """
        Store value under key.

        key: Key to store the value under.
        value: Value to store. Pickled first if pickle is True.
        ttl: Time to live in seconds. Only honoured by backends with a client.
        pickle: Whether to pickle the value before storing it.

        Re-raises any exception raised while pickling.
        """
        if pickle is True:
            try:
                value = cPickle.dumps(value)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Could not store %s:%s in redis. Exception: %s, Error: %s." % (key, value, etype, evalue))
                raise
        # Only backend clients support ttl.
        if self.backend_client and ttl:
            self.kv_store.put(key, value, ttl_secs=ttl)
        else:
            self.kv_store.put(key, value)

    def _setBuffered(self, key, value, ttl=0, pickle=True):
        """Queue a set operation in the buffer instead of writing it directly."""
        self.set_buffer.append({'key': key, 'value': value, 'ttl': ttl, 'pickle': pickle})

    def _setBufferedCallback(self, values):
        """Buffer flush callback for non redis backends. Writes the entries one by one."""
        for entry in values:
            self._set(entry['key'], entry['value'], entry['ttl'], entry['pickle'])
        # Report success in the same way as the redis callback does.
        return True

    def _setRedisBufferedCallback(self, values):
        """
        Buffer flush callback for redis. Sends all entries through one pipeline.

        Returns True on success and False if the pipeline could not be executed.
        Re-raises any exception raised while pickling.
        """
        pipe = self.backend_client.pipeline()
        for entry in values:
            # Work on a local copy of the value. Pickling the buffered entry in place
            # would pickle it twice on a retry and expose pickled data to buffered reads.
            payload = entry['value']
            if entry['pickle'] is True:
                try:
                    payload = cPickle.dumps(payload)
                except Exception:
                    etype, evalue, etb = sys.exc_info()
                    self.logger.error("Could not store %s:%s in redis. Exception: %s, Error: %s." % (entry['key'], entry['value'], etype, evalue))
                    raise
            if entry['ttl']:
                pipe.setex(entry['key'], entry['ttl'], payload)
            else:
                pipe.set(entry['key'], payload)
        try:
            pipe.execute()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not flush buffer. Exception: %s, Error: %s." % (etype, evalue))
            return False
        return True

    def get(self, key, unpickle=True):
        """
        Return the value stored under key.

        SECURITY NOTE: values are unpickled. Only use with stores whose content is trusted.
        Re-raises any exception raised while unpickling.
        """
        value = self.kv_store.get(key)
        if unpickle and value:
            try:
                value = cPickle.loads(value)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Could not unpickle %s:%s from redis. Exception: %s, Error: %s." % (key, value, etype, evalue))
                raise
        return value

    def _findBufferedIndex(self, key):
        """Return the index of the first buffered entry for key, or None if there is none."""
        try:
            for index, entry in enumerate(self.set_buffer.buffer):
                if entry["key"] == key:
                    return index
        except (AttributeError, KeyError, TypeError):
            # Buffer not inspectable; callers fall back to the store.
            pass
        return None

    def _getBuffered(self, key, unpickle=True):
        """Return the not yet flushed value for key if there is one, else read from the store."""
        index = self._findBufferedIndex(key)
        if index is not None:
            try:
                return self.set_buffer.buffer[index]['value']
            except IndexError:
                # Buffer was flushed in the meantime, the value is in the store now.
                pass
        return self._get(key, unpickle)

    def delete(self, key):
        """Delete key from the store."""
        self.kv_store.delete(key)

    def _deleteBuffered(self, key):
        """Drop the not yet flushed entry for key if there is one, else delete from the store."""
        index = self._findBufferedIndex(key)
        if index is not None:
            try:
                self.set_buffer.buffer.pop(index)
                return
            except IndexError:
                pass
        self._delete(key)

    def pop(self, key, unpickle=True):
        """Return the value stored under key and delete the key if the value is truthy."""
        value = self.get(key, unpickle)
        if value:
            self.delete(key)
        return value

    def _popBuffered(self, key, unpickle=True):
        """Remove and return the not yet flushed value for key if there is one, else pop from the store."""
        index = self._findBufferedIndex(key)
        if index is not None:
            try:
                return self.set_buffer.buffer.pop(index)['value']
            except IndexError:
                pass
        return self._pop(key, unpickle)

    def shutDown(self):
        """Flush pending buffered values, then shut down the module."""
        set_buffer = getattr(self, 'set_buffer', None)
        if set_buffer is not None:
            try:
                set_buffer.flush()
            except Exception:
                # Best effort: a failing flush must not prevent the shutdown.
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Could not flush buffer. Exception: %s, Error: %s." % (etype, evalue))
        BaseThreadedModule.shutDown(self)
class RedisListSink(BaseThreadedModule):
    """
    Send events to a redis lists.

    list: Name of redis list to send data to.
    server: Redis server to connect to.
    port: Port redis server is listening on.
    db: Redis db.
    password: Redis password.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is send.
    store_interval_in_secs: Send data to redis in x seconds intervals.
    batch_size: Send data to redis if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - RedisListSink:
       list:                            # <type: String; is: required>
       server:                          # <default: 'localhost'; type: string; is: optional>
       port:                            # <default: 6379; type: integer; is: optional>
       db:                              # <default: 0; type: integer; is: optional>
       password:                        # <default: None; type: None||string; is: optional>
       format:                          # <default: None; type: None||string; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""
    can_run_forked = True

    def configure(self, configuration):
        """Read the configuration, create the redis client and verify the connection with a ping."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.list = self.getConfigurationValue('list')
        self.client = redis.StrictRedis(host=self.getConfigurationValue('server'),
                                        port=self.getConfigurationValue('port'),
                                        password=self.getConfigurationValue('password'),
                                        db=self.getConfigurationValue('db'))
        try:
            self.client.ping()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('server'),etype, evalue))
            self.lumbermill.shutDown()

    def getStartMessage(self):
        """Return the message describing target server, list and buffer size."""
        return "publishing to %s:%s -> %s. Max buffer size: %d" % (self.getConfigurationValue('server'),
                                                                   self.getConfigurationValue('port'),
                                                                   self.list,
                                                                   self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the event buffer. storeData is used as the flush callback."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(self.getConfigurationValue('batch_size'),
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.getConfigurationValue('backlog_size'))

    def storeData(self, buffered_data):
        """
        Append all buffered items to the redis list.

        Returns True on success (or if there was nothing to send), False on error.
        """
        if not buffered_data:
            # RPUSH without values is rejected by redis; an empty batch is no error.
            return True
        try:
            self.client.rpush(self.list, *buffered_data)
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            self.logger.error("Could not add event to redis list %s. Exception: %s, Error: %s." % (self.list, exc_type, exc_value))
            return False
        return True

    def handleEvent(self, event):
        """Buffer the formatted event (or the whole event if no format is set). Yields None."""
        if self.format:
            publish_data = mapDynamicValue(self.format, event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def shutDown(self):
        """Flush pending events, then shut down the module."""
        # The buffer only exists after initAfterFork has run.
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                # Best effort: a failing flush must not prevent the shutdown.
                exc_type, exc_value, exc_tb = sys.exc_info()
                self.logger.error("Could not add event to redis list %s. Exception: %s, Error: %s." % (self.list, exc_type, exc_value))
        BaseThreadedModule.shutDown(self)
Beispiel #3
0
class RedisListSink(BaseThreadedModule):
    """
    Send events to a redis lists.

    list: Name of redis list to send data to.
    server: Redis server to connect to.
    port: Port redis server is listening on.
    db: Redis db.
    password: Redis password.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is send.
    store_interval_in_secs: Send data to redis in x seconds intervals.
    batch_size: Send data to redis if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - RedisListSink:
       list:                            # <type: String; is: required>
       server:                          # <default: 'localhost'; type: string; is: optional>
       port:                            # <default: 6379; type: integer; is: optional>
       db:                              # <default: 0; type: integer; is: optional>
       password:                        # <default: None; type: None||string; is: optional>
       format:                          # <default: None; type: None||string; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""
    can_run_forked = True

    def configure(self, configuration):
        """Read the configuration, create the redis client and verify the connection with a ping."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.list = self.getConfigurationValue('list')
        self.client = redis.StrictRedis(
            host=self.getConfigurationValue('server'),
            port=self.getConfigurationValue('port'),
            password=self.getConfigurationValue('password'),
            db=self.getConfigurationValue('db'))
        try:
            self.client.ping()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not connect to redis store at %s. Exception: %s, Error: %s."
                % (self.getConfigurationValue('server'), etype, evalue))
            self.lumbermill.shutDown()

    def getStartMessage(self):
        """Return the message describing target list, server and buffer size."""
        return "[%s] on %s:%s. Max buffer size: %d" % (
            self.list, self.getConfigurationValue('server'),
            self.getConfigurationValue('port'),
            self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the event buffer. storeData is used as the flush callback."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))

    def storeData(self, buffered_data):
        """
        Append all buffered items to the redis list.

        Returns True on success (or if there was nothing to send), False on error.
        """
        if not buffered_data:
            # RPUSH without values is rejected by redis; an empty batch is no error.
            return True
        try:
            self.client.rpush(self.list, *buffered_data)
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            self.logger.error(
                "Could not add event to redis list %s. Exception: %s, Error: %s."
                % (self.list, exc_type, exc_value))
            return False
        return True

    def handleEvent(self, event):
        """Buffer the formatted event (or the whole event if no format is set). Yields None."""
        publish_data = mapDynamicValue(self.format, event) if self.format else event
        self.buffer.append(publish_data)
        yield None

    def shutDown(self):
        """Flush pending events, then shut down the module."""
        # The buffer only exists after initAfterFork has run.
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                # Best effort: a failing flush must not prevent the shutdown.
                exc_type, exc_value, exc_tb = sys.exc_info()
                self.logger.error(
                    "Could not add event to redis list %s. Exception: %s, Error: %s."
                    % (self.list, exc_type, exc_value))
        BaseThreadedModule.shutDown(self)
Beispiel #4
0
class ZabbixSink(BaseThreadedModule):
    """
    Send events to zabbix.

    hostname: Hostname for which the metrics should be stored.
    fields: Event fields to send.
    field_prefix: Prefix to prepend to field names. For e.g. cpu_count field with default lumbermill_ prefix, the Zabbix key is lumbermill_cpu_count.
    timestamp_field: Field to provide timestamp. If not provided, current timestamp is used.
    agent_conf: Path to zabbix_agent configuration file. If set to True defaults to /etc/zabbix/zabbix_agentd.conf.
    server: Address of zabbix server. If port differs from default it can be set by appending it, e.g. 127.0.0.1:10052.
    store_interval_in_secs: sending data to es in x seconds intervals.
    batch_size: sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - ZabbixSink:
       hostname:                        # <type: string; is: required>
       fields:                          # <type: list; is: required>
       field_prefix:                    # <default: "lumbermill_"; type: string; is: optional>
       timestamp_field:                 # <default: "timestamp"; type: string; is: optional>
       agent_conf:                      # <default: True; type: boolean||string; is: optional>
       server:                          # <default: False; type: boolean||string; is: required if agent_conf is False else optional>
       store_interval_in_secs:          # <default: 10; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read the configuration, create the zabbix sender and the event buffer."""
        BaseThreadedModule.configure(self, configuration)
        self.hostname = self.getConfigurationValue("hostname")
        self.fields = self.getConfigurationValue("fields")
        self.field_prefix = self.getConfigurationValue("field_prefix")
        self.timestamp_field = self.getConfigurationValue("timestamp_field")
        self.batch_size = self.getConfigurationValue('batch_size')
        self.backlog_size = self.getConfigurationValue('backlog_size')
        self.agent_conf = self.getConfigurationValue("agent_conf")
        if self.agent_conf is True:
            self.agent_conf = "/etc/zabbix/zabbix_agentd.conf"
        self.zabbix_sender = self._createZabbixSender()
        self.buffer = self._createBuffer()

    def _createZabbixSender(self):
        """Return a ZabbixSender for the agent config file or the configured server, None on config errors."""
        if self.agent_conf:
            if not os.path.isfile(self.agent_conf):
                self.logger.error("%s does not point to an existing file." % self.agent_conf)
                self.lumbermill.shutDown()
                return None
            return ZabbixSender(use_config=self.agent_conf)
        server = self.getConfigurationValue("server")
        if not server:
            # server defaults to False and is only optional if agent_conf is set.
            self.logger.error("Neither agent_conf nor server is configured.")
            self.lumbermill.shutDown()
            return None
        port = 10051
        if ":" in server:
            # Split at the last colon so only the trailing part is taken as port.
            server, _, port_string = server.rpartition(":")
            try:
                port = int(port_string)
            except ValueError:
                self.logger.error("%s is not a valid port." % port_string)
                self.lumbermill.shutDown()
                return None
        return ZabbixSender(zabbix_server=server, zabbix_port=port)

    def _createBuffer(self):
        """Return a new event buffer that uses storeData as flush callback."""
        return Buffer(self.getConfigurationValue('batch_size'), self.storeData,
                      self.getConfigurationValue('store_interval_in_secs'),
                      maxsize=self.getConfigurationValue('backlog_size'))

    def getStartMessage(self):
        """Return the message describing the zabbix target and buffer size."""
        backlog_size = self.getConfigurationValue('backlog_size')
        if self.agent_conf:
            return "Config: %s. Max buffer size: %d" % (self.agent_conf, backlog_size)
        return "Server: %s. Max buffer size: %d" % (self.getConfigurationValue("server"), backlog_size)

    def initAfterFork(self):
        """Replace the buffer created in configure with a fresh one after the fork."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = self._createBuffer()

    def handleEvent(self, event):
        """Buffer the event for the next transmission. Yields None."""
        self.buffer.append(event)
        yield None

    def storeData(self, events):
        """
        Build one metric per configured field found in each event and send them to zabbix.

        Fields missing in an event are skipped. A missing or unconfigured timestamp
        field results in a timestamp of None being passed to ZabbixMetric.
        Returns True once the metrics were handed to the sender, False if no sender exists.
        """
        if self.zabbix_sender is None:
            return False
        packet = []
        for event in events:
            # Reset per event so no stale timestamp of a previous event is reused.
            timestamp = None
            if self.timestamp_field:
                try:
                    timestamp = event[self.timestamp_field]
                except KeyError:
                    timestamp = None
            hostname = mapDynamicValue(self.hostname, mapping_dict=event, use_strftime=True)
            for field_name in self.fields:
                try:
                    field_value = event[field_name]
                except KeyError:
                    continue
                packet.append(ZabbixMetric(hostname, "%s%s" % (self.field_prefix, field_name), field_value, timestamp))
        if not packet:
            return True
        response = self.zabbix_sender.send(packet)
        if response.failed != 0:
            self.logger.warning("%d of %d metrics were not processed correctly." % (response.total-response.processed, response.total))
        return True

    def shutDown(self):
        """Flush pending events, then shut down the module."""
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            pending.flush()
        BaseThreadedModule.shutDown(self)
Beispiel #5
0
class GraphiteSink(BaseThreadedModule):
    """
    Send metrics to graphite server.

    server: Graphite server to connect to.
    port: Port carbon-cache is listening on.
    formats: Format of messages to send to graphite, e.g.: ['lumbermill.stats.event_rate_$(interval)s $(event_rate)'].
    store_interval_in_secs: Send data to graphite in x seconds intervals.
    batch_size: Send data to graphite if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Send count of events waiting for transmission. Events above count will be dropped.

    Here a simple example to send http_status statistics to graphite:

    ...

    - Statistics:
       interval: 10
       fields: ['http_status']

    - GraphiteSink:
       filter: if $(field_name) == "http_status"
       server: 127.0.0.1
       batch_size: 1
       formats: ['lumbermill.stats.http_200_$(interval)s $(field_counts.200)',
                 'lumbermill.stats.http_400_$(interval)s $(field_counts.400)',
                 'lumbermill.stats.http_total_$(interval)s $(total_count)']

    ...

    Configuration template:

    - GraphiteSink:
       server:                          # <default: 'localhost'; type: string; is: optional>
       port:                            # <default: 2003; type: integer; is: optional>
       formats:                         # <type: list; is: required>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 50; type: integer; is: optional>
       backlog_size:                    # <default: 50; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read the configuration. The connection itself is opened in initAfterFork."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.formats = self.getConfigurationValue('formats')
        self.connection_data = (self.getConfigurationValue('server'), self.getConfigurationValue('port'))
        self.connection = None

    def connect(self):
        """Return a socket connected to the graphite server, or False if connecting failed."""
        # Connect to server
        connection = socket.socket()
        try:
            connection.connect(self.connection_data)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Failed to connect to %s. Exception: %s, Error: %s." % (self.connection_data, etype, evalue))
            # Do not leak the unconnected socket.
            connection.close()
            return False
        return connection

    def getStartMessage(self):
        """Return the message describing target server and buffer size."""
        return "%s:%s. Max buffer size: %d" % (self.connection_data[0], self.connection_data[1], self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the event buffer and open the connection. Requests a shutdown if connecting fails."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(self.getConfigurationValue('batch_size'),
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.getConfigurationValue('backlog_size'))
        self.connection = self.connect()
        if not self.connection:
            self.lumbermill.shutDown()

    def handleEvent(self, event):
        """Buffer one "<mapped format> <unix timestamp>" line per format that maps to a value. Yields None."""
        for metric_format in self.formats:
            # NOTE(review): relies on a mapDynamicValue method not defined in this class - confirm it exists on the base class.
            mapped_data = self.mapDynamicValue(metric_format, event)
            if mapped_data:
                self.buffer.append("%s %s" % (mapped_data, int(time.time())))
        yield None

    def _reconnect(self):
        """Close the current connection and try up to 5 times, 5 seconds apart, to open a new one."""
        if self.connection:
            try:
                self.connection.close()
            except socket.error:
                pass
        self.connection = None
        tries = 0
        while tries < 5 and not self.connection:
            time.sleep(5)
            # connection_data is a tuple, so it needs to be wrapped to be formatted as one value.
            self.logger.warning("Trying to reconnect to %s." % (self.connection_data,))
            self.connection = self.connect()
            tries += 1
        return bool(self.connection)

    def storeData(self, events):
        """
        Send all buffered lines to graphite, each terminated by a newline.

        On a communication error the connection is re-established and the failed
        line is sent once more. Returns True if all lines were sent, False otherwise.
        A failed reconnect requests a LumberMill shutdown.
        """
        for event in events:
            if not event.endswith("\n"):
                event += "\n"
            try:
                # sendall, since send may transmit only a part of the data.
                self.connection.sendall(event)
                continue
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
            if not self._reconnect():
                self.logger.error("Reconnect failed. Shutting down.")
                self.lumbermill.shutDown()
                return False
            self.logger.info("Reconnection to %s successful." % (self.connection_data,))
            try:
                self.connection.sendall(event)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
                return False
        return True

    def shutDown(self):
        """Close the connection to graphite, then shut down the module."""
        connection = getattr(self, 'connection', None)
        if connection:
            try:
                connection.close()
            except socket.error:
                pass
            self.connection = None
        BaseThreadedModule.shutDown(self)
Beispiel #6
0
class ZmqSink(BaseThreadedModule):
    """
    Sends events to zeromq.

    server: Server to connect to. Pattern: hostname:port.
    pattern: Either push or pub.
    mode: Whether to run a server or client. If running as server, pool size is restricted to a single process.
    topic: The channels topic.
    hwm: Highwatermark for sending socket.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'. If not set the whole event dict is sent msgpacked.
    store_interval_in_secs: Send data to zeromq in x seconds intervals.
    batch_size: Send data to zeromq if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    Configuration template:

    - ZmqSink:
       server:                          # <default: 'localhost:5570'; type: string; is: optional>
       pattern:                         # <default: 'push'; type: string; values: ['push', 'pub']; is: optional>
       mode:                            # <default: 'connect'; type: string; values: ['connect', 'bind']; is: optional>
       topic:                           # <default: None; type: None||string; is: optional>
       hwm:                             # <default: None; type: None||integer; is: optional>
       format:                          # <default: None; type: None||string; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
    """

    # Set module type.
    # NOTE(review): other sinks in this file declare "output"; "input" looks like a
    # copy/paste leftover - confirm before changing, since the framework reads this value.
    module_type = "input"
    can_run_forked = True

    def configure(self, configuration):
        """
        Read the module configuration.

        When running in 'bind' mode, forking is disabled for this instance.
        """
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        # Keep the configured address so getStartMessage can report it.
        self.server = self.getConfigurationValue('server')
        self.topic = self.getConfigurationValue('topic')
        self.format = self.getConfigurationValue('format')
        self.mode = self.getConfigurationValue('mode')
        if self.mode == "bind":
            self.can_run_forked = False

    def initZmqContext(self):
        """
        Create the zeromq context and the sending socket, then connect or bind it.

        If connecting/binding fails, the error is logged and a LumberMill shutdown is requested.
        """
        self.zmq_context = zmq.Context()
        if self.getConfigurationValue('pattern') == 'push':
            socket_type = zmq.PUSH
        else:
            socket_type = zmq.PUB
        self.client = self.zmq_context.socket(socket_type)
        hwm = self.getConfigurationValue('hwm')
        if hwm:
            try:
                self.client.setsockopt(zmq.SNDHWM, hwm)
            except Exception:
                # Fall back to the generic high water mark option if SNDHWM is not usable.
                self.client.setsockopt(zmq.HWM, hwm)
        server_name, server_port = self.getConfigurationValue('server').split(":")
        try:
            server_addr = socket.gethostbyname(server_name)
        except socket.gaierror:
            # Name could not be resolved; hand the configured name to zeromq unchanged.
            server_addr = server_name
        endpoint = 'tcp://%s:%s' % (server_addr, server_port)
        try:
            if self.getConfigurationValue('mode') == 'connect':
                self.client.connect(endpoint)
            else:
                self.client.bind(endpoint)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to zeromq at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('server'), etype, evalue))
            self.lumbermill.shutDown()

    def getStartMessage(self):
        """Return the start message containing the configured server and the backlog size."""
        return "%s. Max buffer size: %d" % (self.server, self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the zeromq socket and the send buffer in the (possibly forked) process."""
        BaseThreadedModule.initAfterFork(self)
        self.initZmqContext()
        self.buffer = Buffer(self.getConfigurationValue('batch_size'), self.storeData, self.getConfigurationValue('store_interval_in_secs'), maxsize=self.getConfigurationValue('backlog_size'))

    def storeData(self, buffered_data):
        """
        Send all buffered messages through the zeromq socket.

        Returns True when all messages were sent, None when the zeromq context has been
        terminated and False on any other error. Errors whose message is
        'Interrupted system call' or 'Socket operation on non-socket' are not logged.
        """
        try:
            for data in buffered_data:
                self.client.send("%s" % data)
            return True
        except zmq.error.ContextTerminated:
            # The context has been terminated; there is nothing left to send to.
            return None
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            # Compare the error text, not the exception object itself.
            if str(exc_value) not in ('Interrupted system call', 'Socket operation on non-socket'):
                self.logger.error("Could not add events to zmq. Exception: %s, Error: %s." % (exc_type, exc_value))
            return False

    def handleEvent(self, event):
        """
        Buffer the event for sending.

        The event is rendered via the configured format, or msgpacked as a whole when
        no format is set. If a topic is configured it is prepended, separated by a space.
        """
        if self.format:
            publish_data = mapDynamicValue(self.format, event)
        else:
            publish_data = msgpack.packb(event)
        if self.topic:
            publish_data = "%s %s" % (self.topic, publish_data)
        self.buffer.append(publish_data)
        yield None

    def shutDown(self):
        """Flush pending messages, close the socket, terminate the context and shut down the parent."""
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning("Could not flush buffer on shutdown. Exception: %s, Error: %s." % (etype, evalue))
        try:
            self.client.close()
            self.zmq_context.term()
        except AttributeError:
            # Socket/context were never created (initZmqContext did not run).
            pass
        # Call parent shutDown method.
        BaseThreadedModule.shutDown(self)
class ElasticSearchSink(BaseThreadedModule):
    """
    Store the data dictionary in an elasticsearch index.

    The elasticsearch module takes care of discovering all nodes of the elasticsearch cluster.
    Requests will then be loadbalanced via round robin.

    action:     Either index or update. If update be sure to provide the correct doc_id.
    format:     Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'.
                If not set the whole event dict is sent.
    nodes:      Configures the elasticsearch nodes.
    read_timeout: Set number of seconds to wait until requests to elasticsearch will time out.
    connection_type:    One of: 'urllib3', 'requests'.
    http_auth:  'user:password'.
    use_ssl:    One of: True, False.
    index_name: Sets the index name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    doc_id:     Sets the es document id for the committed event data.
    routing:    Sets a routing value (@see: http://www.elasticsearch.org/blog/customizing-your-document-routing/)
                Timepatterns like %Y.%m.%d are allowed here.
    ttl:        When set, documents will be automatically deleted after ttl expired.
                Can either set time in milliseconds or elasticsearch date format, e.g.: 1d, 15m etc.
                This feature needs to be enabled for the index.
                @See: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/mapping-ttl-field.html
    sniff_on_start: The client can be configured to inspect the cluster state to get a list of nodes upon startup.
                    Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    sniff_on_connection_fail: The client can be configured to inspect the cluster state to get a list of nodes upon failure.
                              Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    consistency:    One of: 'one', 'quorum', 'all'.
    store_interval_in_secs:     Send data to es in x seconds intervals.
    batch_size: Sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size:   Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - ElasticSearchSink:
       action:                          # <default: 'index'; type: string; is: optional; values: ['index', 'update']>
       format:                          # <default: None; type: None||string; is: optional>
       nodes:                           # <type: string||list; is: required>
       read_timeout:                    # <default: 10; type: integer; is: optional>
       connection_type:                 # <default: 'urllib3'; type: string; values: ['urllib3', 'requests']; is: optional>
       http_auth:                       # <default: None; type: None||string; is: optional>
       use_ssl:                         # <default: False; type: boolean; is: optional>
       index_name:                      # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       doc_id:                          # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       routing:                         # <default: None; type: None||string; is: optional>
       ttl:                             # <default: None; type: None||integer||string; is: optional>
       sniff_on_start:                  # <default: False; type: boolean; is: optional>
       sniff_on_connection_fail:        # <default: False; type: boolean; is: optional>
       consistency:                     # <default: 'quorum'; type: string; values: ['one', 'quorum', 'all']; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
    """

    # Set module type.
    module_type = "output"

    def configure(self, configuration):
        """Read the module configuration and adjust the log level of the client libraries."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        if self.getConfigurationValue('log_level') == 'info':
            # Keep the client libraries quiet on the default log level.
            library_log_level = logging.WARN
        else:
            # Set log level for elasticsearch library if configured to other than default.
            library_log_level = self.logger.level
        for module_name in ['elasticsearch', 'urllib3', 'requests']:
            logging.getLogger(module_name).setLevel(library_log_level)
        self.action = self.getConfigurationValue('action')
        self.format = self.getConfigurationValue('format')
        self.consistency = self.getConfigurationValue("consistency")
        self.ttl = self.getConfigurationValue("ttl")
        self.index_name = self.getConfigurationValue("index_name")
        self.routing_pattern = self.getConfigurationValue("routing")
        self.doc_id_pattern = self.getConfigurationValue("doc_id")
        self.es_nodes = self.getConfigurationValue("nodes")
        self.read_timeout = self.getConfigurationValue("read_timeout")
        if not isinstance(self.es_nodes, list):
            self.es_nodes = [self.es_nodes]
        connection_type = self.getConfigurationValue("connection_type")
        if connection_type == 'urllib3':
            self.connection_class = elasticsearch.connection.Urllib3HttpConnection
        elif connection_type == 'requests':
            self.connection_class = elasticsearch.connection.RequestsHttpConnection

    def getStartMessage(self):
        """Return the start message containing the index name pattern and the backlog size."""
        return "Idx: %s. Max buffer size: %d" % (
            self.index_name, self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the elasticsearch client and the send buffer in the (possibly forked) process."""
        BaseThreadedModule.initAfterFork(self)
        # Init es client after fork as mentioned in https://elasticsearch-py.readthedocs.org/en/master/
        self.es = self.connect()
        if not self.es:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork, init buffer here.
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create the elasticsearch client, retrying up to five times.

        The wait time between attempts grows by one second per attempt, starting at
        four seconds. Returns the client, or False when all attempts failed (in which
        case a LumberMill shutdown has been requested).
        """
        es = False
        tries = 0
        while tries < 5 and not es:
            try:
                # Connect to es node and round-robin between them.
                self.logger.debug("Connecting to %s." % self.es_nodes)
                es = elasticsearch.Elasticsearch(
                    self.es_nodes,
                    connection_class=self.connection_class,
                    timeout=self.read_timeout,
                    sniff_on_start=self.getConfigurationValue(
                        'sniff_on_start'),
                    sniff_on_connection_fail=self.getConfigurationValue(
                        'sniff_on_connection_fail'),
                    sniff_timeout=5,
                    maxsize=20,
                    use_ssl=self.getConfigurationValue('use_ssl'),
                    http_auth=self.getConfigurationValue('http_auth'))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                wait_seconds = 4 + tries
                self.logger.warning(
                    "Connection to %s failed. Exception: %s, Error: %s." %
                    (self.es_nodes, etype, evalue))
                self.logger.warning(
                    "Waiting %s seconds before retring to connect." %
                    wait_seconds)
                time.sleep(wait_seconds)
                tries += 1
        if not es:
            self.logger.error("Connection to %s failed. Shutting down." %
                              self.es_nodes)
            self.lumbermill.shutDown()
        else:
            self.logger.debug("Connection to %s successful." % self.es_nodes)
        return es

    def handleEvent(self, event):
        """Buffer the event (or its configured format rendering) for the next bulk request."""
        if self.format:
            publish_data = self.getConfigurationValue('format', event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def dataToElasticSearchJson(self, events):
        """
        Format data for elasticsearch bulk update.

        For every event an action header line and a document line are generated.
        Events for which no doc_id can be resolved, or which cannot be json encoded,
        are logged and skipped.

        Returns the concatenated bulk request body as a string.
        """
        json_data = []
        for event in events:
            index_name = mapDynamicValueInString(self.index_name,
                                                 event,
                                                 use_strftime=True).lower()
            if 'lumbermill' in event and 'event_type' in event['lumbermill']:
                event_type = event['lumbermill']['event_type']
            else:
                event_type = 'Unknown'
            doc_id = mapDynamicValue(self.doc_id_pattern, event)
            if not doc_id:
                self.logger.error(
                    "Could not find doc_id %s for event %s." %
                    (self.getConfigurationValue("doc_id"), event))
                continue
            action_meta = {
                '_index': index_name,
                '_type': event_type,
                '_id': doc_id
            }
            # Meta data belongs to the configured action ('index' or 'update'), not always to 'index'.
            if self.routing_pattern:
                # Only time patterns are resolved for the routing value.
                action_meta['_routing'] = mapDynamicValue(self.routing_pattern, use_strftime=True)
            if self.ttl:
                action_meta['_ttl'] = self.ttl
            header = {self.action: action_meta}
            if self.action == 'update':
                event = {'doc': event}
            try:
                json_data.append("\n".join(
                    (json.dumps(header), json.dumps(event), "\n")))
            except UnicodeDecodeError:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    "Could not json encode %s. Exception: %s, Error: %s." %
                    (event, etype, evalue))
        return "".join(json_data)

    def storeData(self, events):
        """
        Send the given events to elasticsearch via a single bulk request.

        On a lost connection, or an error mentioning a broken pipe / connection reset,
        the client is re-created via connect().

        Returns True on success, None otherwise.
        """
        json_data = self.dataToElasticSearchJson(events)
        try:
            self.es.bulk(body=json_data, consistency=self.consistency)
            return True
        except elasticsearch.exceptions.ConnectionError:
            self.logger.warning(
                "Lost connection to %s. Trying to reconnect." %
                (self.es_nodes, ))
            self.es = self.connect()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Server communication error. Exception: %s, Error: %s." %
                (etype, evalue))
            self.logger.debug("Payload: %s" % json_data)
            # Inspect the error text; the exception object itself is not a string.
            error_text = str(evalue)
            if "Broken pipe" in error_text or "Connection reset by peer" in error_text:
                self.es = self.connect()

    def shutDown(self):
        """Flush pending events and shut down the parent module."""
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning(
                    "Could not flush buffer on shutdown. Exception: %s, Error: %s." %
                    (etype, evalue))
        BaseThreadedModule.shutDown(self)
Beispiel #8
0
class KeyValueStore(BaseThreadedModule):
    """
    A simple wrapper around the python simplekv module.

    It can be used to store results of modules in all simplekv supported backends.

    When set, the following options cause RedisStore to use a buffer for setting values.
    Multiple values are set via the pipe command, which speeds up storage. Still this comes at a price.
    Buffered values, that have not yet been send to redis, will be lost when LumberMill crashes.

    backend: backends supported by [simplekv](http://pythonhosted.org//simplekv/)
    store_interval_in_secs: Sending data to redis in x seconds intervals.
    batch_size: Sending data to redis if count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of values waiting for transmission. Values above count will be dropped.

    Configuration template:

    - KeyValueStore:
       backend:                         # <default: 'DictStore'; type: string; values:['DictStore', 'RedisStore', 'MemcacheStore']; is: optional>
       server:                          # <default: None; type: None||string; is: required if backend in ['RedisStore', 'MemcacheStore'] and cluster is None else optional>
       cluster:                         # <default: None; type: None||dictionary; is: required if backend == 'RedisStore' and server is None else optional>
       port:                            # <default: 6379; type: integer; is: optional>
       db:                              # <default: 0; type: integer; is: optional>
       password:                        # <default: None; type: None||string; is: optional>
       socket_timeout:                  # <default: 10; type: integer; is: optional>
       charset:                         # <default: 'utf-8'; type: string; is: optional>
       errors:                          # <default: 'strict'; type: string; is: optional>
       decode_responses:                # <default: False; type: boolean; is: optional>
       unix_socket_path:                # <default: None; type: None||string; is: optional>
       batch_size:                      # <default: None; type: None||integer; is: optional>
       store_interval_in_secs:          # <default: None; type: None||integer; is: optional>
       backlog_size:                    # <default: 5000; type: integer; is: optional>
    """
    # Set module type.
    module_type = "stand_alone"

    def configure(self, configuration):
        """
        Create the configured simplekv backend and, if requested, the write buffer.

        When store_interval_in_secs or batch_size is set, set/get/delete/pop are
        replaced on this instance by their buffered variants; the unbuffered
        implementations stay reachable as _set/_get/_delete/_pop.
        """
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        backend = self.getConfigurationValue('backend')
        self.backend_client = None
        if backend == 'DictStore':
            import simplekv.memory
            self.kv_store = simplekv.memory.DictStore()
        elif backend == 'RedisStore':
            import simplekv.memory.redisstore
            self.backend_client = self.getRedisClient()
            self.kv_store = simplekv.memory.redisstore.RedisStore(self.backend_client)
        elif backend == 'MemcacheStore':
            import simplekv.memory.memcachestore
            self.backend_client = self.getMemcacheClient()
            self.kv_store = simplekv.memory.memcachestore.MemcacheStore(self.backend_client)

        self.set_buffer = None
        if self.getConfigurationValue('store_interval_in_secs') or self.getConfigurationValue('batch_size'):
            # Redis can store a whole batch through a pipeline; other backends store entry by entry.
            if backend == 'RedisStore':
                flush_callback = self.setRedisBufferedCallback
            else:
                flush_callback = self.setBufferedCallback
            self.set_buffer = Buffer(self.getConfigurationValue('batch_size'), flush_callback, self.getConfigurationValue('store_interval_in_secs'), maxsize=self.getConfigurationValue('backlog_size'))
            self._set = self.set
            self.set = self.setBuffered
            self._get = self.get
            self.get = self.getBuffered
            self._delete = self.delete
            self.delete = self.deleteBuffered
            self._pop = self.pop
            self.pop = self.popBuffered

    def getRedisClient(self):
        """
        Return a redis client for the configured server or cluster.

        The client is pinged once; if that fails the error is logged and a LumberMill
        shutdown is requested. Returns None if the client could not be created.
        """
        cluster = self.getConfigurationValue('cluster')
        if cluster:
            redis_store = cluster
            client = self.getClusterRedisClient()
        else:
            redis_store = self.getConfigurationValue('server')
            client = self.getSimpleRedisClient()
        if client is None:
            # Creation failed; this was already logged and a shutdown was requested.
            return None
        try:
            client.ping()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (redis_store, etype, evalue))
            self.lumbermill.shutDown()
        return client

    def getMemcacheClient(self):
        """Return the memcache client. Not implemented yet, always returns None."""
        client = None
        # TODO: implement memcache client
        return client

    def getSimpleRedisClient(self):
        """
        Return a redis.StrictRedis client built from the module configuration.

        Returns None (after logging and requesting a shutdown) if the client could not be created.
        """
        try:
            return redis.StrictRedis(host=self.getConfigurationValue('server'),
                                     port=self.getConfigurationValue('port'),
                                     db=self.getConfigurationValue('db'),
                                     password=self.getConfigurationValue('password'),
                                     socket_timeout=self.getConfigurationValue('socket_timeout'),
                                     charset=self.getConfigurationValue('charset'),
                                     errors=self.getConfigurationValue('errors'),
                                     decode_responses=self.getConfigurationValue('decode_responses'),
                                     unix_socket_path=self.getConfigurationValue('unix_socket_path'))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('server'), etype, evalue))
            self.lumbermill.shutDown()
            return None

    def getClusterRedisClient(self):
        """
        Return a rediscluster client built from the 'cluster' configuration.

        The configuration maps master addresses to one slave address or a list of
        slave addresses. Every node gets a unique key ("node_1", "node_2", ...); the
        first master becomes the default node.

        Returns None (after logging and requesting a shutdown) if the rediscluster
        module is missing or the client could not be created.
        """
        try:
            import rediscluster
        except ImportError:
            self.logger.error("Could not import rediscluster module. To install follow instructions @https://github.com/salimane/rediscluster-py")
            self.lumbermill.shutDown()
            return None
        # TODO: Implement a locking mechanism for the cluster client.
        # Some modules like Facet depend on this.
        cluster = {'nodes': {}, 'master_of': {}}
        counter = 0
        for master_node, slave_nodes in self.getConfigurationValue('cluster').items():
            # Every node, master or slave, consumes its own counter value so keys never collide.
            counter += 1
            master_node_key = "node_%d" % counter
            node_name_or_ip, node_port = self._parseRedisServerAddress(master_node)
            cluster['nodes'][master_node_key] = {'host': node_name_or_ip, 'port': node_port}
            if 'default_node' not in cluster:
                # The default node is referenced by its key in the nodes dictionary.
                cluster['default_node'] = master_node_key
            if not slave_nodes:
                slave_nodes = []
            elif isinstance(slave_nodes, str):
                slave_nodes = [slave_nodes]
            for slave_node in slave_nodes:
                counter += 1
                slave_node_key = "node_%d" % counter
                node_name_or_ip, node_port = self._parseRedisServerAddress(slave_node)
                cluster['nodes'][slave_node_key] = {'host': node_name_or_ip, 'port': node_port}
                # NOTE(review): with several slaves only the last one is kept per master - confirm this is intended.
                cluster['master_of'][master_node_key] = slave_node_key
        try:
            return rediscluster.StrictRedisCluster(cluster=cluster, db=self.getConfigurationValue('db'))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('cluster'), etype, evalue))
            self.lumbermill.shutDown()
            return None

    def _parseRedisServerAddress(self, node_address):
        """Split "host:port" into (host, port); without a port the configured default port is used."""
        try:
            node_name_or_ip, node_port = node_address.split(":")
        except ValueError:
            node_name_or_ip = node_address
            node_port = self.getConfigurationValue('port')
        return (node_name_or_ip, node_port)

    def iterKeys(self):
        """Yield all keys of the underlying store."""
        for key in self.kv_store.iter_keys():
            yield key

    def getClient(self):
        """Return the backend client (None for backends without one)."""
        return self.backend_client

    def getLock(self, name, timeout=None, sleep=0.1):
        """Return a lock from the backend client, or False if the backend cannot provide one."""
        lock = False
        try:
            lock = self.backend_client.lock(name, timeout, sleep)
        except Exception:
            # Best effort: backends without a client or without lock support yield False.
            pass
        return lock

    def _pickleValue(self, key, value):
        """Pickle value for storage; log and re-raise if that fails."""
        try:
            return cPickle.dumps(value)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not store %s:%s in redis. Exception: %s, Error: %s." % (key, value, etype, evalue))
            raise

    def _unpickleValue(self, key, value, unpickle):
        """Unpickle a stored value if requested and non-empty; log and re-raise if that fails."""
        if unpickle and value:
            try:
                # NOTE(security): unpickling executes arbitrary code if the stored data is attacker controlled.
                value = cPickle.loads(value)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Could not unpickle %s:%s from redis. Exception: %s, Error: %s." % (key, value, etype, evalue))
                raise
        return value

    def _findBufferedEntry(self, key, remove=False):
        """
        Return the first not yet flushed buffer entry for key, or None.

        With remove=True the entry is also taken out of the buffer.
        """
        try:
            pending = self.set_buffer.buffer
            for index, entry in enumerate(pending):
                if entry["key"] == key:
                    if remove:
                        pending.pop(index)
                    return entry
        except (AttributeError, IndexError, KeyError, TypeError):
            # The buffer may be flushed concurrently; treat any inconsistency as
            # "not buffered" so callers fall back to the store.
            pass
        return None

    def set(self, key, value, ttl=0, pickle=True):
        """Store value under key, pickled by default, optionally with a time to live in seconds."""
        if pickle is True:
            value = self._pickleValue(key, value)
        if ttl:
            self.kv_store.put(key, value, ttl_secs=ttl)
        else:
            self.kv_store.put(key, value)

    def setBuffered(self, key, value, ttl=0, pickle=True):
        """Queue value for storage under key; it is written on the next buffer flush."""
        if pickle is True:
            value = self._pickleValue(key, value)
        entry = {'key': key, 'value': value}
        if ttl:
            entry['ttl'] = ttl
        self.set_buffer.append(entry)

    def setBufferedCallback(self, values):
        """
        Write buffered entries to the store one by one.

        Values were already pickled (if requested) by setBuffered, so they are stored as is.
        Returns True once all entries have been stored.
        """
        for value in values:
            if 'ttl' in value:
                self._set(value['key'], value['value'], value['ttl'], pickle=False)
            else:
                self._set(value['key'], value['value'], pickle=False)
        return True

    def setRedisBufferedCallback(self, values):
        """
        Write buffered entries to redis through a single pipeline.

        Returns True on success, None if executing the pipeline failed.
        """
        pipe = self.backend_client.pipeline()
        for value in values:
            if 'ttl' in value:
                pipe.setex(value['key'], value['ttl'], value['value'])
            else:
                pipe.set(value['key'], value['value'])
        try:
            pipe.execute()
            return True
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not flush buffer. Exception: %s, Error: %s." % (etype, evalue))

    def get(self, key, unpickle=True):
        """Return the value stored under key, unpickled by default."""
        value = self.kv_store.get(key)
        return self._unpickleValue(key, value, unpickle)

    def getBuffered(self, key, unpickle=True):
        """Return the value for key from the write buffer if it is still pending, else from the store."""
        entry = self._findBufferedEntry(key)
        if entry is None:
            return self._get(key, unpickle)
        return self._unpickleValue(key, entry['value'], unpickle)

    def delete(self, key):
        """Delete key from the store."""
        self.kv_store.delete(key)

    def deleteBuffered(self, key):
        """
        Delete key from the write buffer if it is still pending, else from the store.

        NOTE(review): a pending entry is only removed from the buffer; an older value
        already written to the store is left untouched - confirm this is intended.
        """
        if self._findBufferedEntry(key, remove=True) is None:
            self._delete(key)

    def pop(self, key, unpickle=True):
        """Return the value stored under key and delete the key if a value was found."""
        value = self.get(key, unpickle)
        if value:
            self.delete(key)
        return value

    def popBuffered(self, key, unpickle=True):
        """Remove and return the value for key from the write buffer if pending, else from the store."""
        entry = self._findBufferedEntry(key, remove=True)
        if entry is None:
            return self._pop(key, unpickle)
        return self._unpickleValue(key, entry['value'], unpickle)

    def shutDown(self):
        """Flush values still waiting in the write buffer and shut down the parent module."""
        pending = getattr(self, 'set_buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning("Could not flush buffer on shutdown. Exception: %s, Error: %s." % (etype, evalue))
        BaseThreadedModule.shutDown(self)
Beispiel #9
0
class RedisStore(BaseThreadedModule):
    """
    A simple wrapper around the redis python module.

    It can be used to store results of modules in a redis key/value store.

        server: Redis server to connect to.
        cluster: Dictionary of redis masters as keys and pack_followers as values, e.g.: {'172.16.0.1:6379': '172.16.0.2:6379'}
        port: Port redis server is listening on.
        db: Redis db.
        password: Redis password.
        socket_timeout: Socket timeout in seconds.
        charset: Charset to use.
        errors: tbd.
        decode_responses: specifies whether return values from Redis commands get decoded automatically using the client's charset value.
        unix_socket_path: Path to unix socket file.

    When set, the following options cause RedisStore to use a buffer for setting values.
    Multiple values are set via the pipe command, which speeds up storage. Still this comes at a price.
    Buffered values, that have not yet been send to redis, will be lost when LumberMill crashes.

        store_interval_in_secs: Sending data to redis in x seconds intervals.
        batch_size: Sending data to redis if count is above, even if store_interval_in_secs is not reached.
        backlog_size: Maximum count of values waiting for transmission. Values above count will be dropped.

    Configuration template:

    - RedisStore:
       server:                         # <default: 'localhost'; type: string; is: optional>
       cluster:                        # <default: {}; type: dictionary; is: optional>
       port:                           # <default: 6379; type: integer; is: optional>
       db:                             # <default: 0; type: integer; is: optional>
       password:                       # <default: None; type: None||string; is: optional>
       socket_timeout:                 # <default: 10; type: integer; is: optional>
       charset:                        # <default: 'utf-8'; type: string; is: optional>
       errors:                         # <default: 'strict'; type: string; is: optional>
       decode_responses:               # <default: False; type: boolean; is: optional>
       unix_socket_path:               # <default: None; type: None||string; is: optional>
       batch_size:                     # <default: None; type: None||integer; is: optional>
       store_interval_in_secs:         # <default: None; type: None||integer; is: optional>
       backlog_size:                   # <default: 5000; type: integer; is: optional>
    """
    module_type = "stand_alone"
    """Set module type"""
    def configure(self, configuration):
        """
        Create the redis client, verify the connection and optionally switch to buffered writes.

        When store_interval_in_secs or batch_size is configured, set/get/delete/pop are rebound to their
        *Buffered variants; the unbuffered originals stay reachable as _set/_get/_delete/_pop.
        """
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        if not self.getConfigurationValue('cluster'):
            redis_store = self.getConfigurationValue('server')
            self.client = self.getRedisClient()
        else:
            redis_store = self.getConfigurationValue('cluster')
            self.client = self.getClusterRedisClient()
        try:
            # Also fails (AttributeError) when no client could be created at all.
            self.client.ping()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not connect to redis store at %s. Exception: %s, Error: %s."
                % (redis_store, etype, evalue))
            self.lumbermill.shutDown()
        self.set_buffer = None
        batch_size = self.getConfigurationValue('batch_size')
        store_interval = self.getConfigurationValue('store_interval_in_secs')
        if store_interval or batch_size:
            self.set_buffer = Buffer(
                batch_size,
                self.setBufferedCallback,
                store_interval,
                maxsize=self.getConfigurationValue('backlog_size'))
            self._set = self.set
            self.set = self.setBuffered
            self._get = self.get
            self.get = self.getBuffered
            self._delete = self.delete
            self.delete = self.deleteBuffered
            self._pop = self.pop
            self.pop = self.popBuffered

    def getRedisClient(self):
        """Return a StrictRedis client for the configured single server, or None if it could not be created."""
        try:
            return redis.StrictRedis(
                host=self.getConfigurationValue('server'),
                port=self.getConfigurationValue('port'),
                db=self.getConfigurationValue('db'),
                password=self.getConfigurationValue('password'),
                socket_timeout=self.getConfigurationValue('socket_timeout'),
                charset=self.getConfigurationValue('charset'),
                errors=self.getConfigurationValue('errors'),
                decode_responses=self.getConfigurationValue(
                    'decode_responses'),
                unix_socket_path=self.getConfigurationValue(
                    'unix_socket_path'))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            # getConfigurationValue is a method; subscripting it here used to raise a TypeError
            # that replaced the real error.
            self.logger.error(
                "Could not connect to redis store at %s. Exception: %s, Error: %s."
                % (self.getConfigurationValue('server'), etype, evalue))
            return None

    def getClusterRedisClient(self):
        """
        Return a StrictRedisCluster client built from the 'cluster' configuration value.

        Every master and every slave address gets its own "node_<n>" entry. The counter advances once per
        node; it used to advance only for slaves, which made later masters overwrite earlier entries.
        'master_of' is left empty, as before.
        """
        import rediscluster
        # TODO: Implement a locking mechnism for the cluster client.
        # Some modules like Facet depend on this.
        cluster = {'nodes': {}, 'master_of': {}}
        node_count = 0
        for master_node, slave_nodes in self.getConfigurationValue(
                'cluster').items():
            # A single slave may be configured as a plain string.
            if isinstance(slave_nodes, str):
                slave_nodes = [slave_nodes]
            for node_address in [master_node] + list(slave_nodes or []):
                node_count += 1
                node_name_or_ip, node_port = self._parseRedisServerAddress(
                    node_address)
                cluster['nodes']["node_%d" % node_count] = {
                    'host': node_name_or_ip,
                    'port': node_port
                }
        client = rediscluster.StrictRedisCluster(
            cluster=cluster, db=self.getConfigurationValue('db'))
        return client

    def _parseRedisServerAddress(self, node_address):
        """Split "host:port" into (host, port); without exactly one colon the configured default port is used."""
        try:
            node_name_or_ip, node_port = node_address.split(":")
        except ValueError:
            node_name_or_ip = node_address
            node_port = self.getConfigurationValue('port')
        return (node_name_or_ip, node_port)

    def getClient(self):
        """Return the underlying redis client object."""
        return self.client

    def getLock(self, name, timeout=None, sleep=0.1):
        """Return a lock object from the redis client for the given name."""
        return self.client.lock(name, timeout, sleep)

    def _pickleValue(self, key, value):
        """Serialize value with cPickle; failures are logged and re-raised."""
        try:
            return cPickle.dumps(value)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not store %s:%s in redis. Exception: %s, Error: %s."
                % (key, value, etype, evalue))
            raise

    def _unpickleValue(self, key, value):
        """Deserialize value with cPickle; failures are logged and re-raised."""
        try:
            return cPickle.loads(value)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not unpickle %s:%s from redis. Exception: %s, Error: %s."
                % (key, value, etype, evalue))
            raise

    def set(self, key, value, ttl=0, pickle=True):
        """
        Store value under key directly in redis.

        ttl: Expiry in seconds; a falsy ttl stores the key without expiry.
        pickle: Only when exactly True is the value pickled before it is stored.
        """
        if pickle is True:
            value = self._pickleValue(key, value)
        if ttl:
            self.client.setex(key, ttl, value)
        else:
            self.client.set(key, value)

    def setBuffered(self, key, value, ttl=0, pickle=True):
        """Like set, but queue the write in the buffer; it reaches redis via setBufferedCallback."""
        if pickle is True:
            value = self._pickleValue(key, value)
        entry = {'key': key, 'value': value}
        if ttl:
            # setBufferedCallback picks SETEX over SET based on the presence of this key.
            entry['ttl'] = ttl
        self.set_buffer.append(entry)

    def setBufferedCallback(self, values):
        """
        Flush callback of the buffer: write all queued entries through one redis pipeline.

        Returns True on success and None when the pipeline could not be executed (the error is logged).
        """
        pipe = self.client.pipeline()
        for value in values:
            if 'ttl' in value:
                pipe.setex(value['key'], value['ttl'], value['value'])
            else:
                pipe.set(value['key'], value['value'])
        try:
            pipe.execute()
            return True
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not flush buffer. Exception: %s, Error: %s." %
                (etype, evalue))

    def get(self, key, unpickle=True):
        """Return the value stored under key in redis, unpickled when unpickle is true and the value is truthy."""
        value = self.client.get(key)
        if unpickle and value:
            value = self._unpickleValue(key, value)
        return value

    def getBuffered(self, key, unpickle=True):
        """
        Like get, but look in the pending write buffer before asking redis.

        Returns the value itself. It used to return the raw buffer entry dict with the value still pickled,
        which did not match what get returns.
        """
        entry = next(
            (item for item in self.set_buffer.buffer if item["key"] == key),
            None)
        if entry is None:
            return self._get(key, unpickle)
        value = entry["value"]
        if unpickle and value:
            value = self._unpickleValue(key, value)
        return value

    def delete(self, key):
        """Delete key from redis."""
        self.client.delete(key)

    def deleteBuffered(self, key):
        """Drop the first pending buffer entry for key; if none is buffered, delete the key from redis."""
        pending = self.set_buffer.buffer
        for position, item in enumerate(pending):
            if item["key"] == key:
                pending.pop(position)
                return
        self._delete(key)

    def pop(self, key, unpickle=True):
        """Return the value for key and delete the key when the value is truthy."""
        value = self.get(key, unpickle)
        if value:
            self.delete(key)
        return value

    def popBuffered(self, key, unpickle=True):
        """
        Like pop, but take the value out of the pending write buffer when it is still waiting there.

        Returns the value itself. It used to return the raw buffer entry dict with the value still pickled.
        """
        pending = self.set_buffer.buffer
        for position, item in enumerate(pending):
            if item["key"] == key:
                value = pending.pop(position)["value"]
                break
        else:
            return self._pop(key, unpickle)
        if unpickle and value:
            value = self._unpickleValue(key, value)
        return value

    def shutDown(self):
        """
        Flush buffered writes, then run the base class shutdown.

        The buffer lives in self.set_buffer. The old code flushed a non-existent self.buffer and swallowed the
        error, so queued values were lost on shutdown.
        """
        set_buffer = getattr(self, 'set_buffer', None)
        if set_buffer is not None:
            try:
                set_buffer.flush()
            except Exception:
                # Shutdown stays best effort, but a failed flush should be visible.
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    "Could not flush buffer. Exception: %s, Error: %s." %
                    (etype, evalue))
        BaseThreadedModule.shutDown(self)
Beispiel #10
0
class MongoDbSink(BaseThreadedModule):
    """
    Store incoming events in a mongodb.

    host: Mongodb server.
    database: Mongodb database name.
    collection: Mongodb collection name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    optinonal_connection_params: Other optional parameters as documented in https://api.mongodb.org/python/current/api/pymongo/mongo_client.html
    format:     Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'.
                If not set the whole event dict is send.
    doc_id:     Sets the document id for the committed event data.
    store_interval_in_secs:     Send data to es in x seconds intervals.
    batch_size: Sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size:   Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - MongoDbSink:
       host:                            # <default: 'localhost:27017'; type: string; is: optional>
       database:                        # <default: 'lumbermill'; type: string; is: optional>
       collection:                      # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       optinonal_connection_params:     # <default: {'serverSelectionTimeoutMS': 5}; type: dictionary; is: optional>
       format:                          # <default: None; type: None||string; is: optional>
       doc_id:                          # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 5000; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""
    def configure(self, configuration):
        """Read format, collection, database and doc_id from the module configuration."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.collection = self.getConfigurationValue('collection')
        self.database = self.getConfigurationValue('database')
        self.doc_id_pattern = self.getConfigurationValue("doc_id")

    def getStartMessage(self):
        """Return the start message naming the database and the backlog size."""
        return "DB: %s. Max buffer size: %d" % (
            self.getConfigurationValue('database'),
            self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Connect to mongodb and create the event buffer; both happen after the fork."""
        BaseThreadedModule.initAfterFork(self)
        # Init monogdb client after fork.
        self.mongodb = self.connect()
        if self.mongodb is None:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork, init buffer here.
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create a MongoClient for the configured host and check it with server_info().

        Returns the client, or None when creating or checking it failed; in that case a LumberMill shutdown
        is requested. The old code referenced an unbound local when MongoClient() itself raised, and
        reported success when only the server_info() check failed.
        """
        host = self.getConfigurationValue('host')
        mongodb_client = None
        try:
            mongodb_client = pymongo.MongoClient(
                host,
                **self.getConfigurationValue('optinonal_connection_params'))
            self.logger.debug(str(mongodb_client.server_info()))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning(
                "Connection to %s failed. Exception: %s, Error: %s." %
                (host, etype, evalue))
            mongodb_client = None
        if mongodb_client is None:
            self.logger.error("Connection to %s failed. Shutting down." % host)
            self.lumbermill.shutDown()
        else:
            self.logger.debug("Connection to %s successful." % host)
        return mongodb_client

    def handleEvent(self, event):
        """
        Queue the event (or its formatted representation) for the next bulk write.

        NOTE(review): storeData assigns event['_id'], which presumes the buffered item is a dict -
        confirm what a configured format yields.
        """
        if self.format:
            publish_data = self.getConfigurationValue('format', event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def storeData(self, events):
        """
        Flush callback of the buffer: insert the events via one ordered bulk operation per collection.

        Events without a resolvable doc_id are logged and skipped.
        """
        mongo_db = self.mongodb[self.database]
        bulk_objects = {}
        for event in events:
            collection_name = mapDynamicValueInString(
                self.collection, event, use_strftime=True).lower()
            doc_id = mapDynamicValue(self.doc_id_pattern, event)
            if not doc_id:
                self.logger.error("Could not find doc_id %s for event %s." %
                                  (self.doc_id_pattern, event))
                continue
            event['_id'] = doc_id
            if collection_name not in bulk_objects:
                bulk_objects[collection_name] = mongo_db[
                    collection_name].initialize_ordered_bulk_op()
            try:
                bulk_objects[collection_name].insert(event)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    "Server communication error. Exception: %s, Error: %s." %
                    (etype, evalue))
                self.logger.debug("Payload: %s" % event)
                # evalue is an exception instance; the substring test needs its text.
                error_text = str(evalue)
                if "Broken pipe" in error_text or "Connection reset by peer" in error_text:
                    self.mongodb = self.connect()
        for bulk_object in bulk_objects.values():
            try:
                result = bulk_object.execute()
                self.logger.debug(str(result))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    "Server communication error. Exception: %s, Error: %s." %
                    (etype, evalue))

    def shutDown(self):
        """Flush pending events (best effort), then run the base class shutdown."""
        # The buffer does not exist when initAfterFork bailed out early.
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning(
                    "Could not flush buffer. Exception: %s, Error: %s." %
                    (etype, evalue))
        BaseThreadedModule.shutDown(self)
Beispiel #11
0
class SQSSink(BaseThreadedModule):
    """
    Send messages to amazon sqs service.

    aws_access_key_id: Your AWS id.
    aws_secret_access_key: Your AWS password.
    region: The region in which to find your sqs service.
    queue: Queue name.
    format: Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'.
            If not set event.data will be send es MessageBody, all other fields will be send as MessageAttributes.
    store_interval_in_secs: Send data to redis in x seconds intervals.
    batch_size: Number of messages to collect before starting to send messages to sqs. This refers to the internal
                receive buffer of this plugin. When the receive buffer is maxed out, this plugin will always send
                the maximum of 10 messages in one send_message_batch call.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.

    values: ['us-east-1', 'us-west-1', 'us-west-2', 'eu-central-1', 'eu-west-1', 'ap-southeast-1', 'ap-southeast-2', 'ap-northeast-1', 'sa-east-1', 'us-gov-west-1', 'cn-north-1']

    Configuration template:

    - SQSSink:
       aws_access_key_id:               # <type: string; is: required>
       aws_secret_access_key:           # <type: string; is: required>
       region:                          # <type: string; is: required>
       queue:                           # <type: string; is: required>
       format:                          # <default: None; type: None||string; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
       receivers:
        - NextModule
    """

    module_type = "output"
    """Set module type"""
    def configure(self, configuration):
        """Silence the boto loggers and read batch_size and format from the module configuration."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        # Set boto log level.
        logging.getLogger('boto3').setLevel(logging.CRITICAL)
        logging.getLogger('botocore').setLevel(logging.CRITICAL)
        self.batch_size = self.getConfigurationValue('batch_size')
        self.format = self.getConfigurationValue('format')

    def getStartMessage(self):
        """Return the start message naming queue, region and backlog size."""
        return "Queue: %s [%s]. Max buffer size: %d" % (
            self.getConfigurationValue('queue'),
            self.getConfigurationValue('region'),
            self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the event buffer and look up the configured sqs queue; requests a shutdown on failure."""
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))
        try:
            self.sqs_resource = boto3.resource(
                'sqs',
                region_name=self.getConfigurationValue('region'),
                api_version=None,
                use_ssl=True,
                verify=None,
                endpoint_url=None,
                aws_access_key_id=self.getConfigurationValue(
                    'aws_access_key_id'),
                aws_secret_access_key=self.getConfigurationValue(
                    'aws_secret_access_key'),
                aws_session_token=None,
                config=None)
            self.sqs_queue = self.sqs_resource.get_queue_by_name(
                QueueName=self.getConfigurationValue('queue'))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Could not connect to sqs service. Exception: %s, Error: %s." %
                (etype, evalue))
            self.lumbermill.shutDown()

    def handleEvent(self, event):
        """Queue the event for the next batch transmission."""
        self.buffer.append(event)
        yield None

    def storeData(self, buffered_data):
        """
        Flush callback of the buffer: send the events to sqs in batches of at most ten entries.

        The old condition `len(batch_messages) % 10` was truthy for every size except multiples of ten,
        so each message went out alone, and the trailing partial batch called a non-existent self.send().
        Events that cannot be encoded as json are logged and skipped instead of being queued as a dict body.
        """
        # Ten is the batch limit named in the class docstring.
        max_batch_entries = 10
        batch_messages = []
        for event in buffered_data:
            try:
                message_id = event['lumbermill']['event_id']
            except KeyError:
                message_id = "%032x%s" % (random.getrandbits(128), os.getpid())
            if self.format:
                message_body = mapDynamicValue(self.format, event)
            else:
                try:
                    message_body = json.dumps(event)
                except Exception:
                    etype, evalue, etb = sys.exc_info()
                    self.logger.warning(
                        "Error while encoding event data: %s to json. Exception: %s, Error: %s."
                        % (event, etype, evalue))
                    continue
            batch_messages.append({'Id': message_id, 'MessageBody': message_body})
            if len(batch_messages) == max_batch_entries:
                self.sqs_queue.send_messages(Entries=batch_messages)
                batch_messages = []
        if batch_messages:
            # Send whatever is left over after the last full batch.
            self.sqs_queue.send_messages(Entries=batch_messages)

    def shutDown(self):
        """Flush pending events (best effort), then run the base class shutdown like the sibling sinks do."""
        # The buffer does not exist when shutDown is called before initAfterFork.
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning(
                    "Could not flush buffer. Exception: %s, Error: %s." %
                    (etype, evalue))
        BaseThreadedModule.shutDown(self)
Beispiel #12
0
class MongoDbSink(BaseThreadedModule):
    """
    Store incoming events in a mongodb.

    host: Mongodb server.
    database: Mongodb database name.
    collection: Mongodb collection name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    optinonal_connection_params: Other optional parameters as documented in https://api.mongodb.org/python/current/api/pymongo/mongo_client.html
    format:     Which event fields to send on, e.g. '$(@timestamp) - $(url) - $(country_code)'.
                If not set the whole event dict is send.
    doc_id:     Sets the document id for the committed event data.
    store_interval_in_secs:     Send data to es in x seconds intervals.
    batch_size: Sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size:   Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - MongoDbSink:
       host:                            # <default: 'localhost:27017'; type: string; is: optional>
       database:                        # <default: 'lumbermill'; type: string; is: optional>
       collection:                      # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       optinonal_connection_params:     # <default: {'serverSelectionTimeoutMS': 5}; type: dictionary; is: optional>
       format:                          # <default: None; type: None||string; is: optional>
       doc_id:                          # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 5000; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read format, collection, database and doc_id from the module configuration."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        self.format = self.getConfigurationValue('format')
        self.collection = self.getConfigurationValue('collection')
        self.database = self.getConfigurationValue('database')
        self.doc_id_pattern = self.getConfigurationValue("doc_id")

    def getStartMessage(self):
        """Return the start message naming the database and the backlog size."""
        return "DB: %s. Max buffer size: %d" % (self.getConfigurationValue('database'), self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Connect to mongodb and create the event buffer; both happen after the fork."""
        BaseThreadedModule.initAfterFork(self)
        # Init monogdb client after fork.
        self.mongodb = self.connect()
        if self.mongodb is None:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork, init buffer here.
        self.buffer = Buffer(self.getConfigurationValue('batch_size'), self.storeData, self.getConfigurationValue('store_interval_in_secs'), maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create a MongoClient for the configured host and check it with server_info().

        Returns the client, or None when creating or checking it failed; in that case a LumberMill shutdown
        is requested. The old code referenced an unbound local when MongoClient() itself raised, and
        reported success when only the server_info() check failed.
        """
        host = self.getConfigurationValue('host')
        mongodb_client = None
        try:
            mongodb_client = pymongo.MongoClient(host, **self.getConfigurationValue('optinonal_connection_params'))
            self.logger.debug(str(mongodb_client.server_info()))
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.warning("Connection to %s failed. Exception: %s, Error: %s." % (host, etype, evalue))
            mongodb_client = None
        if mongodb_client is None:
            self.logger.error("Connection to %s failed. Shutting down." % host)
            self.lumbermill.shutDown()
        else:
            self.logger.debug("Connection to %s successful." % host)
        return mongodb_client

    def handleEvent(self, event):
        """
        Queue the event (or its formatted representation) for the next bulk write.

        NOTE(review): storeData assigns event['_id'], which presumes the buffered item is a dict -
        confirm what a configured format yields.
        """
        if self.format:
            publish_data = self.getConfigurationValue('format', event)
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def storeData(self, events):
        """
        Flush callback of the buffer: insert the events via one ordered bulk operation per collection.

        Events without a resolvable doc_id are logged and skipped.
        """
        mongo_db = self.mongodb[self.database]
        bulk_objects = {}
        for event in events:
            collection_name = mapDynamicValueInString(self.collection, event, use_strftime=True).lower()
            doc_id = mapDynamicValue(self.doc_id_pattern, event)
            if not doc_id:
                self.logger.error("Could not find doc_id %s for event %s." % (self.doc_id_pattern, event))
                continue
            event['_id'] = doc_id
            if collection_name not in bulk_objects:
                bulk_objects[collection_name] = mongo_db[collection_name].initialize_ordered_bulk_op()
            try:
                bulk_objects[collection_name].insert(event)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
                self.logger.debug("Payload: %s" % event)
                # evalue is an exception instance; the substring test needs its text.
                error_text = str(evalue)
                if "Broken pipe" in error_text or "Connection reset by peer" in error_text:
                    self.mongodb = self.connect()
        for bulk_object in bulk_objects.values():
            try:
                result = bulk_object.execute()
                self.logger.debug(str(result))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))

    def shutDown(self):
        """Flush pending events (best effort), then run the base class shutdown."""
        # The buffer does not exist when initAfterFork bailed out early.
        pending = getattr(self, 'buffer', None)
        if pending is not None:
            try:
                pending.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning("Could not flush buffer. Exception: %s, Error: %s." % (etype, evalue))
        BaseThreadedModule.shutDown(self)
Beispiel #13
0
class GraphiteSink(BaseThreadedModule):
    """
    Send metrics to graphite server.

    server: Graphite server to connect to.
    port: Port carbon-cache is listening on.
    formats: Format of messages to send to graphite, e.g.: ['lumbermill.stats.event_rate_$(interval)s $(event_rate)'].
    store_interval_in_secs: Send data to graphite in x seconds intervals.
    batch_size: Send data to graphite if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Send count of events waiting for transmission. Events above count will be dropped.

    Here a simple example to send http_status statistics to graphite:

    ...

    - Statistics:
       interval: 10
       fields: ['http_status']

    - GraphiteSink:
       filter: if $(field_name) == "http_status"
       server: 127.0.0.1
       batch_size: 1
       formats: ['lumbermill.stats.http_200_$(interval)s $(field_counts.200)',
                 'lumbermill.stats.http_400_$(interval)s $(field_counts.400)',
                 'lumbermill.stats.http_total_$(interval)s $(total_count)']

    ...

    Configuration template:

    - GraphiteSink:
       server:                          # <default: 'localhost'; type: string; is: optional>
       port:                            # <default: 2003; type: integer; is: optional>
       formats:                         # <type: list; is: required>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 50; type: integer; is: optional>
       backlog_size:                    # <default: 50; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""
    def configure(self, configuration):
        """Read formats and the (server, port) connection data from the module configuration."""
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.formats = self.getConfigurationValue('formats')
        self.connection_data = (self.getConfigurationValue('server'),
                                self.getConfigurationValue('port'))
        self.connection = None

    def connect(self):
        """Open a socket to the configured server; returns the socket, or False when connecting failed."""
        # Connect to server
        connection = socket.socket()
        try:
            connection.connect(self.connection_data)
            return connection
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Failed to connect to %s. Exception: %s, Error: %s." %
                (self.connection_data, etype, evalue))
            # Do not leak the half-initialised socket.
            connection.close()
            return False

    def getStartMessage(self):
        """Return the start message naming server, port and backlog size."""
        return "%s:%s. Max buffer size: %d" % (
            self.connection_data[0], self.connection_data[1],
            self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """
        Create the event buffer and connect to graphite; requests a shutdown when connecting fails.

        The base class hook is invoked once; the original invoked it a second time at the end.
        """
        BaseThreadedModule.initAfterFork(self)
        self.buffer = Buffer(
            self.getConfigurationValue('batch_size'),
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.getConfigurationValue('backlog_size'))
        self.connection = self.connect()
        if not self.connection:
            self.lumbermill.shutDown()
            return

    def handleEvent(self, event):
        """Render every configured format for the event and queue the non-empty results with a timestamp."""
        for format_pattern in self.formats:
            mapped_data = self.mapDynamicValue(format_pattern, event)
            if mapped_data:
                self.buffer.append("%s %s" % (mapped_data, int(time.time())))
        yield None

    def _send(self, payload):
        """Send one line over the current connection; returns True on success, False (after logging) on error."""
        try:
            # sendall keeps writing until the whole line is out; send may stop early.
            self.connection.sendall(payload)
            return True
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error(
                "Server communication error. Exception: %s, Error: %s." %
                (etype, evalue))
            return False

    def _reconnect(self, max_tries=5, delay_in_secs=5):
        """Drop the current connection and retry connect() up to max_tries times; returns True when connected."""
        try:
            self.connection.close()
        except Exception:
            # The connection may already be unusable or unset.
            pass
        self.connection = None
        tries = 0
        while tries < max_tries and not self.connection:
            time.sleep(delay_in_secs)
            # connection_data is a tuple, so it has to be wrapped to fill a single %s.
            self.logger.warning("Trying to reconnect to %s." %
                                (self.connection_data,))
            # Try to reconnect.
            self.connection = self.connect()
            tries += 1
        return bool(self.connection)

    def storeData(self, events):
        """
        Flush callback of the buffer: send every queued line to graphite.

        Returns True once all lines were handed to the socket. The original returned after the first line
        and dropped the rest of the batch. When sending fails the connection is re-established and the line
        is retried once; if reconnecting fails a shutdown is requested and False is returned.
        """
        for event in events:
            if not event.endswith("\n"):
                event += "\n"
            if self._send(event):
                continue
            if not self._reconnect():
                self.logger.error("Reconnect failed. Shutting down.")
                self.lumbermill.shutDown()
                return False
            self.logger.info("Reconnection to %s successful." %
                             (self.connection_data,))
            # Retry the line that triggered the reconnect.
            self._send(event)
        return True

    def shutDown(self):
        """Close the graphite connection (best effort), then run the base class shutdown like the sibling sinks do."""
        try:
            self.connection.close()
        except Exception:
            # Connection may be None/False when connecting never succeeded.
            pass
        BaseThreadedModule.shutDown(self)
Beispiel #14
0
class FileSink(BaseThreadedModule):
    """
    Store all received events in a file.

    file_name: Absolute path to file. String may contain python's strftime directives and event fields, e.g. %Y-%m-%d.
    format: Which event fields to use in the logline, e.g. '$(@timestamp) - $(url) - $(country_code)'
    store_interval_in_secs: Writing data to file in x seconds intervals.
    batch_size: Writing data to file if event count is above, even if store_interval_in_secs is not reached.
    backlog_size: Maximum count of events waiting for transmission. Events above count will be dropped.
    compress: Compress output as gzip or snappy file. For this to be effective, the chunk size should not be too small.

    Configuration template:

    - FileSink:
       file_name:                       # <type: string; is: required>
       format:                          # <default: '$(data)'; type: string; is: optional>
       store_interval_in_secs:          # <default: 10; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
       compress:                        # <default: None; type: None||string; values: [None,'gzip','snappy']; is: optional>
    """

    module_type = "output"
    """Set module type"""
    can_run_forked = False

    def configure(self, configuration):
        """
        Read the configuration, load the selected compression module and create the write buffer.

        Calls self.lumbermill.shutDown() when the selected compression module
        cannot be imported. Also starts the timed function that closes stale
        file handles.
        """
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        self.batch_size = self.getConfigurationValue('batch_size')
        self.backlog_size = self.getConfigurationValue('backlog_size')
        self.file_name = self.getConfigurationValue('file_name')
        self.format = self.getConfigurationValue('format')
        self.compress = self.getConfigurationValue('compress')
        # Maps path -> {'handle': open file object, 'lru': time of last use}.
        self.file_handles = {}
        if self.compress == 'gzip':
            try:
                # Import module into namespace of object. Otherwise it will not be accessible when process was forked.
                self.gzip_module = __import__('gzip')
            except ImportError:
                self.logger.error(
                    'Gzip compression selected but gzip module could not be loaded.'
                )
                self.lumbermill.shutDown()
        elif self.compress == 'snappy':
            try:
                self.snappy_module = __import__('snappy')
            except ImportError:
                self.logger.error(
                    'Snappy compression selected but snappy module could not be loaded.'
                )
                self.lumbermill.shutDown()
        self.buffer = Buffer(
            self.batch_size,
            self.storeData,
            self.getConfigurationValue('store_interval_in_secs'),
            maxsize=self.backlog_size)
        TimedFunctionManager.startTimedFunction(self.closeStaleFileHandles)

    def getStartMessage(self):
        """Return a short description of the target file pattern and the backlog size."""
        return "File: %s. Max buffer size: %d" % (
            self.file_name, self.getConfigurationValue('backlog_size'))

    @setInterval(60)
    def closeStaleFileHandles(self):
        """
        Close and delete file handles that are unused since 5 minutes.
        """
        # Iterate over a snapshot: entries are removed from the dict inside the loop.
        for path, file_handle_data in list(self.file_handles.items()):
            last_used_time_ago = time.time() - file_handle_data['lru']
            if last_used_time_ago < 300:
                continue
            self.logger.info('Closing stale file handle for %s.' % (path))
            file_handle_data['handle'].close()
            self.file_handles.pop(path, None)

    def closeAllFileHandles(self):
        """Close every cached file handle and forget it."""
        # Iterate over a snapshot: entries are removed from the dict inside the loop.
        for path, file_handle_data in list(self.file_handles.items()):
            self.logger.info('Closing file handle for %s.' % path)
            file_handle_data['handle'].close()
            self.file_handles.pop(path, None)

    def ensurePathExists(self, path):
        """
        Create the directory part of path if it does not exist yet.

        Raises OSError when the directory cannot be created.
        """
        dirpath = os.path.dirname(path)
        # A bare file name has no directory part; os.makedirs('') would raise.
        if not dirpath or os.path.isdir(dirpath):
            return
        try:
            os.makedirs(dirpath)
        except OSError:
            # The directory may have been created concurrently; only fail if it is still missing.
            if not os.path.isdir(dirpath):
                raise

    def handleEvent(self, event):
        """Queue the event for the next buffered write. Yields None once; the event is not passed on."""
        self.buffer.append(event)
        yield None

    def getOrCreateFileHandle(self, path, mode):
        """
        Return the cached file handle for path, opening the file with mode if necessary.

        Using a handle refreshes its 'lru' timestamp.
        Returns None (after logging an error) when the file could not be opened.
        """
        file_handle = None
        try:
            file_handle = self.file_handles[path]['handle']
            self.file_handles[path]['lru'] = time.time()
        except KeyError:
            try:
                file_handle = open(path, mode)
                self.file_handles[path] = {
                    'handle': file_handle,
                    'lru': time.time()
                }
            except (IOError, OSError):
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    'Could no open %s for writing. Exception: %s, Error: %s.' %
                    (path, etype, evalue))
        return file_handle

    def storeData(self, events):
        """
        Write a batch of events to their target files.

        File name and log line are rendered per event, so one batch may be
        spread over several files. All files are attempted even if one of them
        fails.

        Returns True when the data for every file was written, False otherwise.
        NOTE(review): a falsy result is presumably treated by Buffer as "batch not stored" - confirm.
        """
        lines_by_path = collections.defaultdict(list)
        for event in events:
            path = mapDynamicValue(self.file_name,
                                   mapping_dict=event,
                                   use_strftime=True)
            line = mapDynamicValue(self.format, mapping_dict=event)
            lines_by_path["%s" % path].append(line + "\n")
        all_written = True
        for path, path_lines in lines_by_path.items():
            lines = "".join(path_lines)
            try:
                self.ensurePathExists(path)
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    'Could no create path %s. Events could not be written. Exception: %s, Error: %s.'
                    % (path, etype, evalue))
                all_written = False
                continue
            mode = "a+"
            if self.compress == 'gzip':
                path += ".gz"
                mode += "b"
                lines = self.compressGzip(lines)
            elif self.compress == 'snappy':
                path += ".snappy"
                lines = self.compressSnappy(lines)
                mode += "b"
            fh = self.getOrCreateFileHandle(path, mode)
            if fh is None:
                # Opening failed; getOrCreateFileHandle already logged the reason.
                all_written = False
                continue
            try:
                fh.write(lines)
                fh.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error(
                    'Could no write event data to %s. Exception: %s, Error: %s.'
                    % (path, etype, evalue))
                all_written = False
        return all_written

    def shutDown(self):
        """Flush buffered events, close all file handles and shut down the base module."""
        self.buffer.flush()
        self.closeAllFileHandles()
        BaseThreadedModule.shutDown(self)

    def compressGzip(self, data):
        """Return data compressed as one complete gzip stream."""
        compressed = StringIO()
        compressor = self.gzip_module.GzipFile(mode='wb', fileobj=compressed)
        try:
            compressor.write(data)
        finally:
            # Closing the GzipFile writes the gzip trailer into the underlying buffer.
            compressor.close()
        return compressed.getvalue()

    def compressSnappy(self, data):
        """Return data compressed with the snappy module."""
        return self.snappy_module.compress(data)
class ElasticSearchSink(BaseThreadedModule):
    """
    Store the data dictionary in an elasticsearch index.

    The elasticsearch module takes care of discovering all nodes of the elasticsearch cluster.
    Requests will then be loadbalanced via round robin.

    action:     Either index or update. If update be sure to provide the correct doc_id.
    fields:     Which event fields to send on, e.g. [timestamp, url, country_code].
                If not set the whole event dict is sent.
    nodes:      Configures the elasticsearch nodes.
    read_timeout: Set number of seconds to wait until requests to elasticsearch will time out.
    connection_type:    One of: 'urllib3', 'requests'.
    http_auth:  'user:password'.
    use_ssl:    One of: True, False.
    index_name: Sets the index name. Timepatterns like %Y.%m.%d and dynamic values like $(bar) are allowed here.
    doc_id:     Sets the es document id for the committed event data.
    routing:    Sets a routing value (@see: http://www.elasticsearch.org/blog/customizing-your-document-routing/)
                Timepatterns like %Y.%m.%d are allowed here.
    ttl:        When set, documents will be automatically deleted after ttl expired.
                Can either set time in milliseconds or elasticsearch date format, e.g.: 1d, 15m etc.
                This feature needs to be enabled for the index.
                @See: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/mapping-ttl-field.html
    sniff_on_start: The client can be configured to inspect the cluster state to get a list of nodes upon startup.
                    Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    sniff_on_connection_fail: The client can be configured to inspect the cluster state to get a list of nodes upon failure.
                              Might cause problems on hosts with multiple interfaces. If connections fail, try to deactivate this.
    store_interval_in_secs:     Send data to es in x seconds intervals.
    batch_size: Sending data to es if event count is above, even if store_interval_in_secs is not reached.
    backlog_size:   Maximum count of events waiting for transmission. If backlog size is exceeded no new events will be processed.

    Configuration template:

    - ElasticSearchSink:
       action:                          # <default: 'index'; type: string; is: optional; values: ['index', 'update']>
       fields:                          # <default: None; type: None||list; is: optional>
       nodes:                           # <type: string||list; is: required>
       read_timeout:                    # <default: 10; type: integer; is: optional>
       connection_type:                 # <default: 'urllib3'; type: string; values: ['urllib3', 'requests']; is: optional>
       http_auth:                       # <default: None; type: None||string; is: optional>
       use_ssl:                         # <default: False; type: boolean; is: optional>
       index_name:                      # <default: 'lumbermill-%Y.%m.%d'; type: string; is: optional>
       doc_id:                          # <default: '$(lumbermill.event_id)'; type: string; is: optional>
       doc_type:                        # <default: '$(lumbermill.event_type)'; type: string; is: optional>
       routing:                         # <default: None; type: None||string; is: optional>
       ttl:                             # <default: None; type: None||integer||string; is: optional>
       sniff_on_start:                  # <default: False; type: boolean; is: optional>
       sniff_on_connection_fail:        # <default: False; type: boolean; is: optional>
       store_interval_in_secs:          # <default: 5; type: integer; is: optional>
       batch_size:                      # <default: 500; type: integer; is: optional>
       backlog_size:                    # <default: 500; type: integer; is: optional>
    """

    module_type = "output"
    """Set module type"""

    def configure(self, configuration):
        """Read the configuration, adjust the log level of the client libraries and select the connection class."""
        # Call parent configure method.
        BaseThreadedModule.configure(self, configuration)
        if self.getConfigurationValue('log_level') == 'info':
            # Keep the client libraries quiet on the default level.
            library_log_level = logging.WARN
        else:
            # Set log level for elasticsearch library if configured to other than default.
            library_log_level = self.logger.level
        for module_name in ['elasticsearch', 'urllib3', 'requests']:
            logging.getLogger(module_name).setLevel(library_log_level)
        self.action = self.getConfigurationValue('action')
        self.fields = self.getConfigurationValue('fields')
        self.ttl = self.getConfigurationValue("ttl")
        self.index_name = self.getConfigurationValue("index_name")
        self.routing_pattern = self.getConfigurationValue("routing")
        self.doc_id_pattern = self.getConfigurationValue("doc_id")
        self.doc_type_pattern = self.getConfigurationValue("doc_type")
        self.doc_type_is_dynamic = self.isDynamicConfigurationValue("doc_type")
        self.es_nodes = self.getConfigurationValue("nodes")
        self.read_timeout = self.getConfigurationValue("read_timeout")
        if not isinstance(self.es_nodes, list):
            self.es_nodes = [self.es_nodes]
        connection_type = self.getConfigurationValue("connection_type")
        if connection_type == 'urllib3':
            self.connection_class = elasticsearch.connection.Urllib3HttpConnection
        elif connection_type == 'requests':
            self.connection_class = elasticsearch.connection.RequestsHttpConnection

    def getStartMessage(self):
        """Return a short description of the index name pattern and the backlog size."""
        return "Idx: %s. Max buffer size: %d" % (self.index_name, self.getConfigurationValue('backlog_size'))

    def initAfterFork(self):
        """Create the es client and the event buffer in the forked process."""
        BaseThreadedModule.initAfterFork(self)
        # Init es client after fork as mentioned in https://elasticsearch-py.readthedocs.org/en/master/
        self.es = self.connect()
        if not self.es:
            self.lumbermill.shutDown()
            return
        # As the buffer uses a threaded timed function to flush its buffer and thread will not survive a fork, init buffer here.
        self.buffer = Buffer(self.getConfigurationValue('batch_size'),
                             self.storeData,
                             self.getConfigurationValue('store_interval_in_secs'),
                             maxsize=self.getConfigurationValue('backlog_size'))

    def connect(self):
        """
        Create an elasticsearch client for the configured nodes.

        Up to 5 attempts are made, waiting 4, 5, 6, ... seconds between them.
        Returns the client, or False after shutting LumberMill down when all attempts failed.
        """
        es = False
        tries = 0
        while tries < 5 and not es:
            try:
                # Connect to es node and round-robin between them.
                self.logger.debug("Connecting to %s." % self.es_nodes)
                es = elasticsearch.Elasticsearch(self.es_nodes,
                                                 connection_class=self.connection_class,
                                                 timeout=self.read_timeout,
                                                 sniff_on_start=self.getConfigurationValue('sniff_on_start'),
                                                 sniff_on_connection_fail=self.getConfigurationValue('sniff_on_connection_fail'),
                                                 sniff_timeout=5,
                                                 maxsize=20,
                                                 use_ssl=self.getConfigurationValue('use_ssl'),
                                                 http_auth=self.getConfigurationValue('http_auth'))
            except Exception:
                etype, evalue, etb = sys.exc_info()
                wait_in_secs = 4 + tries
                self.logger.warning("Connection to %s failed. Exception: %s, Error: %s." % (self.es_nodes, etype, evalue))
                self.logger.warning("Waiting %s seconds before retring to connect." % wait_in_secs)
                time.sleep(wait_in_secs)
                tries += 1
        if not es:
            self.logger.error("Connection to %s failed. Shutting down." % self.es_nodes)
            self.lumbermill.shutDown()
        else:
            self.logger.debug("Connection to %s successful." % self.es_nodes)
        return es

    def handleEvent(self, event):
        """
        Queue the event, or only its configured fields, for the next bulk request.

        Configured fields that are missing in the event are skipped.
        Yields None once; the event is not passed on.
        """
        if self.fields:
            publish_data = {}
            for field in self.fields:
                try:
                    # Copy the field itself; dict.update() would try to interpret the field value as a mapping.
                    publish_data[field] = event[field]
                except KeyError:
                    continue
        else:
            publish_data = event
        self.buffer.append(publish_data)
        yield None

    def dataToElasticSearchJson(self, events):
        """
        Format data for elasticsearch bulk update.

        For every event an action line followed by the document line is created.
        Events for which no doc_id could be rendered and events that cannot be
        json encoded are logged and left out.

        Returns the bulk request body as a single string.
        """
        json_data = []
        for event in events:
            index_name = mapDynamicValueInString(self.index_name, event, use_strftime=True).lower()
            doc_type = mapDynamicValueInString(self.doc_type_pattern, event)
            doc_id = mapDynamicValueInString(self.doc_id_pattern, event)
            if not doc_id:
                self.logger.error("Could not find doc_id %s for event %s." % (self.getConfigurationValue("doc_id"), event))
                continue
            action_meta = {'_index': index_name,
                           '_type': doc_type,
                           '_id': doc_id}
            if self.routing_pattern:
                # Render the routing value against the current event, like the other patterns.
                action_meta['_routing'] = mapDynamicValue(self.routing_pattern, mapping_dict=event, use_strftime=True)
            if self.ttl:
                action_meta['_ttl'] = self.ttl
            # The meta data belongs to the configured action, which is not necessarily 'index'.
            header = {self.action: action_meta}
            if self.action == 'update':
                event = {'doc': event}
            try:
                json_data.append("\n".join((json.dumps(header), json.dumps(event), "\n")))
            except UnicodeDecodeError:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Could not json encode %s. Exception: %s, Error: %s." % (event, etype, evalue))
        json_data = "".join(json_data)
        return json_data

    def storeData(self, events):
        """
        Send a batch of events to elasticsearch with one bulk request.

        On a connection error a reconnect is attempted.
        Returns True when the request was sent (or there was nothing to send), False otherwise.
        """
        json_data = self.dataToElasticSearchJson(events)
        if not json_data:
            # Every event of the batch was skipped; nothing is left to send.
            return True
        try:
            self.es.bulk(body=json_data)
            return True
        except elasticsearch.exceptions.ConnectionError:
            self.logger.warning("Lost connection to %s. Trying to reconnect." % self.es_nodes)
            try:
                self.es = self.connect()
            except Exception:
                time.sleep(.5)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
            self.logger.debug("Payload: %s" % json_data)
            # Compare against the message text; the exception object itself is not a string.
            error_message = str(evalue)
            if "Broken pipe" in error_message or "Connection reset by peer" in error_message:
                self.es = self.connect()
        return False

    def shutDown(self):
        """Flush buffered events (best effort) and shut down the base module."""
        try:
            self.buffer.flush()
        except Exception:
            # The buffer does not exist when the initial connect failed.
            pass
        BaseThreadedModule.shutDown(self)
Beispiel #16
0
class RedisStore(BaseThreadedModule):
    """
    A simple wrapper around the redis python module.

    It can be used to store results of modules in a redis key/value store.

        server: Redis server to connect to.
        cluster: Dictionary of redis masters as keys and their replicas as values, e.g.: {'172.16.0.1:6379': '172.16.0.2:6379'}
        port: Port redis server is listening on.
        db: Redis db.
        password: Redis password.
        socket_timeout: Socket timeout in seconds.
        charset: Charset to use.
        errors: tbd.
        decode_responses: specifies whether return values from Redis commands get decoded automatically using the client's charset value.
        unix_socket_path: Path to unix socket file.

    When set, the following options cause RedisStore to use a buffer for setting values.
    Multiple values are set via the pipe command, which speeds up storage. Still this comes at a price.
    Buffered values, that have not yet been sent to redis, will be lost when LumberMill crashes.

        store_interval_in_secs: Sending data to redis in x seconds intervals.
        batch_size: Sending data to redis if count is above, even if store_interval_in_secs is not reached.
        backlog_size: Maximum count of values waiting for transmission. Values above count will be dropped.

    Configuration template:

    - RedisStore:
       server:                         # <default: 'localhost'; type: string; is: optional>
       cluster:                        # <default: {}; type: dictionary; is: optional>
       port:                           # <default: 6379; type: integer; is: optional>
       db:                             # <default: 0; type: integer; is: optional>
       password:                       # <default: None; type: None||string; is: optional>
       socket_timeout:                 # <default: 10; type: integer; is: optional>
       charset:                        # <default: 'utf-8'; type: string; is: optional>
       errors:                         # <default: 'strict'; type: string; is: optional>
       decode_responses:               # <default: False; type: boolean; is: optional>
       unix_socket_path:               # <default: None; type: None||string; is: optional>
       batch_size:                     # <default: None; type: None||integer; is: optional>
       store_interval_in_secs:         # <default: None; type: None||integer; is: optional>
       backlog_size:                   # <default: 5000; type: integer; is: optional>
    """
    module_type = "stand_alone"
    """Set module type"""

    def configure(self, configuration):
        """
        Create the redis client and, if configured, switch to buffered access.

        When store_interval_in_secs or batch_size is set, set/get/delete/pop are
        replaced on the instance by their buffered variants; the direct variants
        stay reachable as _set/_get/_delete/_pop.
        Calls self.lumbermill.shutDown() when the store does not answer a ping.
        """
        # Call parent configure method
        BaseThreadedModule.configure(self, configuration)
        cluster = self.getConfigurationValue('cluster')
        if not cluster:
            redis_store = self.getConfigurationValue('server')
            self.client = self.getRedisClient()
        else:
            redis_store = cluster
            self.client = self.getClusterRedisClient()
        try:
            self.client.ping()
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (redis_store, etype, evalue))
            self.lumbermill.shutDown()
        self.set_buffer = None
        if self.getConfigurationValue('store_interval_in_secs') or self.getConfigurationValue('batch_size'):
            self.set_buffer = Buffer(self.getConfigurationValue('batch_size'),
                                     self.setBufferedCallback,
                                     self.getConfigurationValue('store_interval_in_secs'),
                                     maxsize=self.getConfigurationValue('backlog_size'))
            self._set = self.set
            self.set = self.setBuffered
            self._get = self.get
            self.get = self.getBuffered
            self._delete = self.delete
            self.delete = self.deleteBuffered
            self._pop = self.pop
            self.pop = self.popBuffered

    def getRedisClient(self):
        """Return a redis.StrictRedis client built from the configuration, or None if it could not be created."""
        try:
            client = redis.StrictRedis(host=self.getConfigurationValue('server'),
                                       port=self.getConfigurationValue('port'),
                                       db=self.getConfigurationValue('db'),
                                       password=self.getConfigurationValue('password'),
                                       socket_timeout=self.getConfigurationValue('socket_timeout'),
                                       charset=self.getConfigurationValue('charset'),
                                       errors=self.getConfigurationValue('errors'),
                                       decode_responses=self.getConfigurationValue('decode_responses'),
                                       unix_socket_path=self.getConfigurationValue('unix_socket_path'))
            return client
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not connect to redis store at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('server'), etype, evalue))
            return None

    def getClusterRedisClient(self):
        """
        Return a rediscluster.StrictRedisCluster client for the configured cluster.

        Every master and every replica is registered under its own "node_<n>" key.
        """
        import rediscluster
        # TODO: Implement a locking mechanism for the cluster client.
        # Some modules like Facet depend on this.
        cluster = {'nodes': {}, 'master_of': {}}
        counter = 0
        for master_node, replica_nodes in self.getConfigurationValue('cluster').items():
            if isinstance(replica_nodes, str):
                replica_nodes = [replica_nodes]
            # Advance the counter for every node so that no key is handed out twice.
            for node_address in [master_node] + list(replica_nodes):
                counter += 1
                node_name_or_ip, node_port = self._parseRedisServerAddress(node_address)
                cluster['nodes']["node_%d" % counter] = {'host': node_name_or_ip, 'port': node_port}
        client = rediscluster.StrictRedisCluster(cluster=cluster,
                                                 db=self.getConfigurationValue('db'))
        return client

    def _parseRedisServerAddress(self, node_address):
        """
        Split "host:port" into a (host, port) tuple with an integer port.

        If node_address is not of the form "host:<number>", the whole address is
        used as host together with the configured default port.
        """
        try:
            node_name_or_ip, node_port = node_address.split(":")
            node_port = int(node_port)
        except ValueError:
            node_name_or_ip = node_address
            node_port = self.getConfigurationValue('port')
        return (node_name_or_ip, node_port)

    def getClient(self):
        """Return the underlying redis client."""
        return self.client

    def getLock(self, name, timeout=None, sleep=0.1):
        """Return a lock object for name from the redis client."""
        return self.client.lock(name, timeout, sleep)

    def _pickleValue(self, key, value):
        """Return value pickled; log and re-raise when it cannot be pickled."""
        try:
            return cPickle.dumps(value)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not store %s:%s in redis. Exception: %s, Error: %s." % (key, value, etype, evalue))
            raise

    def _unpickleValue(self, key, value):
        """Return value unpickled; log and re-raise when it cannot be unpickled."""
        # NOTE(review): unpickling executes arbitrary code if the stored data is attacker controlled.
        # Only use this with a trusted redis store.
        try:
            return cPickle.loads(value)
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not unpickle %s:%s from redis. Exception: %s, Error: %s." % (key, value, etype, evalue))
            raise

    def _findBufferedEntry(self, key, remove=False):
        """
        Return the most recently buffered entry for key, or None if there is none.

        remove: If True, all buffered entries for key are removed from the buffer.
        """
        entries = self.set_buffer.buffer
        newest_entry = None
        try:
            # Walk backwards: the newest entry wins and removing does not shift unvisited indexes.
            for index in reversed(range(len(entries))):
                if entries[index]["key"] != key:
                    continue
                if newest_entry is None:
                    newest_entry = entries[index]
                if not remove:
                    break
                entries.pop(index)
        except IndexError:
            # The buffer shrank while scanning; work with what was found so far.
            pass
        return newest_entry

    def set(self, key, value, ttl=0, pickle=True):
        """
        Store value under key in redis.

        ttl: Expiry time passed to setex; 0 stores the value without expiry.
        pickle: If True, the value is pickled before it is stored.
        """
        if pickle is True:
            value = self._pickleValue(key, value)
        if ttl:
            self.client.setex(key, ttl, value)
        else:
            self.client.set(key, value)

    def setBuffered(self, key, value, ttl=0, pickle=True):
        """Like set, but queue the value in the buffer instead of sending it to redis right away."""
        if pickle is True:
            value = self._pickleValue(key, value)
        if ttl:
            self.set_buffer.append({'key': key, 'ttl': ttl, 'value': value})
        else:
            self.set_buffer.append({'key': key, 'value': value})

    def setBufferedCallback(self, values):
        """
        Send buffered entries to redis through one pipeline.

        Returns True on success, None (after logging) when the pipeline could not be executed.
        """
        pipe = self.client.pipeline()
        for value in values:
            if 'ttl' in value:
                pipe.setex(value['key'], value['ttl'], value['value'])
            else:
                pipe.set(value['key'], value['value'])
        try:
            pipe.execute()
            return True
        except Exception:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not flush buffer. Exception: %s, Error: %s." % (etype, evalue))

    def get(self, key, unpickle=True):
        """Return the value stored under key in redis, unpickled if unpickle is True."""
        value = self.client.get(key)
        if unpickle and value:
            value = self._unpickleValue(key, value)
        return value

    def getBuffered(self, key, unpickle=True):
        """
        Like get, but prefer a value that is still waiting in the buffer.

        Returns the value itself (not the internal buffer entry), unpickled if unpickle is True.
        """
        entry = self._findBufferedEntry(key)
        if entry is None:
            return self._get(key, unpickle)
        value = entry['value']
        if unpickle and value:
            value = self._unpickleValue(key, value)
        return value

    def delete(self, key):
        """Delete key from redis."""
        self.client.delete(key)

    def deleteBuffered(self, key):
        """Delete key from the buffer and from redis."""
        self._findBufferedEntry(key, remove=True)
        # An older value for the key may already have been flushed to redis.
        self._delete(key)

    def pop(self, key, unpickle=True):
        """Return the value stored under key and delete the key."""
        value = self.get(key, unpickle)
        # Compare with None so that stored falsy values like 0 are deleted as well.
        if value is not None:
            self.delete(key)
        return value

    def popBuffered(self, key, unpickle=True):
        """Like pop, but take buffered values into account."""
        value = self.getBuffered(key, unpickle)
        if value is not None:
            self.deleteBuffered(key)
        return value

    def shutDown(self):
        """Flush values that are still buffered (best effort) and shut down the base module."""
        set_buffer = getattr(self, 'set_buffer', None)
        if set_buffer is not None:
            try:
                set_buffer.flush()
            except Exception:
                etype, evalue, etb = sys.exc_info()
                self.logger.error("Could not flush buffer. Exception: %s, Error: %s." % (etype, evalue))
        BaseThreadedModule.shutDown(self)