def storeData(self, events):
    mongo_db = self.mongodb[self.database]
    bulk_objects = {}
    for event in events:
        collection_name = mapDynamicValueInString(self.collection, event, use_strftime=True).lower()
        doc_id = mapDynamicValue(self.doc_id_pattern, event)
        if not doc_id:
            self.logger.error("Could not find doc_id %s for event %s." % (self.doc_id_pattern, event))
            continue
        event['_id'] = doc_id
        # Lazily create one ordered bulk operation per target collection.
        if collection_name not in bulk_objects:
            bulk_objects[collection_name] = mongo_db[collection_name].initialize_ordered_bulk_op()
        try:
            bulk_objects[collection_name].insert(event)
        except:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))
            self.logger.debug("Payload: %s" % event)
            # Reconnect on typical low-level socket failures.
            if "Broken pipe" in str(evalue) or "Connection reset by peer" in str(evalue):
                self.mongodb = self.connect()
    for collection_name, bulk_object in bulk_objects.iteritems():
        try:
            result = bulk_object.execute()
            self.logger.debug(str(result))
        except:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Server communication error. Exception: %s, Error: %s." % (etype, evalue))

def dataToElasticSearchJson(self, events):
    """
    Format data for elasticsearch bulk update.
    """
    json_data = []
    for event in events:
        index_name = mapDynamicValueInString(self.index_name, event, use_strftime=True).lower()
        doc_type = mapDynamicValueInString(self.doc_type_pattern, event)
        doc_id = mapDynamicValueInString(self.doc_id_pattern, event)
        routing = mapDynamicValue(self.routing_pattern, event, use_strftime=True)
        if not doc_id:
            self.logger.error("Could not find doc_id %s for event %s." % (self.getConfigurationValue("doc_id"), event))
            continue
        header = {self.action: {'_index': index_name, '_type': doc_type, '_id': doc_id}}
        if self.routing_pattern:
            header[self.action]['_routing'] = routing
        if self.ttl:
            header[self.action]['_ttl'] = self.ttl
        if self.action == 'update':
            event = {'doc': event}
        try:
            json_data.append("\n".join((json.dumps(header), json.dumps(event), "\n")))
        except UnicodeDecodeError:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not json encode %s. Exception: %s, Error: %s." % (event, etype, evalue))
    json_data = "".join(json_data)
    return json_data

def storeData(self, buffered_data):
    batch_messages = []
    for event in buffered_data:
        try:
            event_id = event['lumbermill']['event_id']
        except KeyError:
            event_id = "%032x%s" % (random.getrandbits(128), os.getpid())
        message = {'Id': event_id}
        if self.format:
            event = mapDynamicValue(self.format, event)
        else:
            try:
                event = json.dumps(event)
            except:
                etype, evalue, etb = sys.exc_info()
                self.logger.warning("Error while encoding event data %s to json. Exception: %s, Error: %s." % (event, etype, evalue))
        message['MessageBody'] = event
        batch_messages.append(message)
        # SQS accepts at most 10 entries per send_messages call.
        if len(batch_messages) % 10 == 0:
            self.sqs_queue.send_messages(Entries=batch_messages)
            batch_messages = []
    if len(batch_messages) > 0:
        self.sqs_queue.send_messages(Entries=batch_messages)

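# The self.sqs_queue handle used above matches boto3's SQS resource API; a
# minimal sketch of how such a handle could be obtained (region and queue
# name are made-up placeholder values, not this module's configuration):
import boto3

def connect(self):
    sqs = boto3.resource('sqs', region_name='eu-west-1')
    return sqs.get_queue_by_name(QueueName='lumbermill-events')
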
def handleEvent(self, event):
    if self.format:
        publish_data = mapDynamicValue(self.format, event)
    else:
        publish_data = event
    self.buffer.append(publish_data)
    yield None

def __handleEvent(self, event):
    if self.format:
        publish_data = mapDynamicValue(self.format, event)
    else:
        publish_data = event
    self.buffer.append(publish_data)
    yield None

def handleEvent(self, event):
    if self.format:
        publish_data = mapDynamicValue(self.format, event)
    else:
        publish_data = msgpack.packb(event)
    if self.topic:
        publish_data = "%s %s" % (self.topic, publish_data)
    self.buffer.append(publish_data)
    yield None

def testMapDynamicValueWithValueFormat(self):
    self.assertEqual(mapDynamicValue('%(longitude)d', self.event), '7')
    self.assertEqual(mapDynamicValue('%(longitude)+d', self.event), '+7')
    self.assertEqual(mapDynamicValue('%(longitude)05.2f', self.event), '07.63')
    self.assertEqual(mapDynamicValue('%(fields.1)10s', self.event), '   expects')
    self.assertEqual(mapDynamicValue('%(fields.1)-10s', self.event), 'expects   ')
    self.assertEqual(mapDynamicValue('%(fields.1).5s', self.event), 'expec')
    self.assertEqual(mapDynamicValue('%(fields.1)-10.5s', self.event), 'expec     ')

def dataToElasticSearchJson(self, events):
    """
    Format data for elasticsearch bulk update.
    """
    json_data = []
    for event in events:
        index_name = mapDynamicValueInString(self.index_name, event, use_strftime=True).lower()
        if 'lumbermill' in event and 'event_type' in event['lumbermill']:
            event_type = event['lumbermill']['event_type']
        else:
            event_type = 'Unknown'
        doc_id = mapDynamicValue(self.doc_id_pattern, event)
        routing = mapDynamicValue(self.routing_pattern, event, use_strftime=True)
        if not doc_id:
            self.logger.error("Could not find doc_id %s for event %s." % (self.getConfigurationValue("doc_id"), event))
            continue
        header = {self.action: {'_index': index_name, '_type': event_type, '_id': doc_id}}
        if self.routing_pattern:
            header[self.action]['_routing'] = routing
        if self.ttl:
            header[self.action]['_ttl'] = self.ttl
        if self.action == 'update':
            event = {'doc': event}
        try:
            json_data.append("\n".join((json.dumps(header), json.dumps(event), "\n")))
        except UnicodeDecodeError:
            etype, evalue, etb = sys.exc_info()
            self.logger.error("Could not json encode %s. Exception: %s, Error: %s." % (event, etype, evalue))
    json_data = "".join(json_data)
    return json_data

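# For reference, the bulk payloads built above are newline-delimited JSON:
# one action header line followed by one document source line per event,
# e.g. (made-up values):
#
#   {"index": {"_index": "lumbermill-2016.01.01", "_type": "httpd_access_log", "_id": "715bd321..."}}
#   {"remote_ip": "127.0.0.1", "bytes_send": "3395"}
#
# Elasticsearch's _bulk endpoint requires this action/source pairing and a
# terminating newline after the last line.
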
def handleEvent(self, event):
    if self.format:
        publish_data = mapDynamicValue(self.format, event)
    else:
        publish_data = event
    try:
        self.client.publish(self.getConfigurationValue('channel', event), publish_data)
    except:
        etype, evalue, etb = sys.exc_info()
        self.logger.error("Could not publish event to redis channel %s at %s. Exception: %s, Error: %s." % (self.getConfigurationValue('channel', event), self.getConfigurationValue('server'), etype, evalue))
    yield None

def storeEvents(self, events):
    """
    As a sidenote: synchronizing multiple processes with a lock, to ensure that only one process
    will write to a given file, does not seem to work as expected. webhdfs does not free the lease
    on a file directly after appending to it. A better approach seems to be to retry the write a
    number of times before failing.
    """
    if len(events) == 0:
        return
    self.is_storing = True
    path = time.strftime(self.path)
    self.ensureDirExists(path)
    write_data = collections.defaultdict(str)
    # Group the formatted event lines by target filename.
    for event in events:
        filename = time.strftime(self.getConfigurationValue('name_pattern'))
        filename = filename % event
        line = mapDynamicValue(self.format, event)
        write_data[filename] += line
    retry_sleep_time = .4
    for filename, lines in write_data.items():
        write_tries = 0
        if self.compress == 'gzip':
            filename += ".gz"
            lines = self.compressGzip(lines)
        elif self.compress == 'snappy':
            filename += ".snappy"
            lines = self.compressSnappy(lines)
        while write_tries < 10:
            try:
                self.ensureFileExists('%s/%s' % (path, filename))
                self.hdfs.append_file('%s/%s' % (path, filename), lines)
                break
            except KeyError:
                etype, evalue, etb = sys.exc_info()
                self.logger.error('Could not log event %s. The format key %s was not present in the event.' % (event, evalue))
                break
            except pywebhdfs.errors.PyWebHdfsException:
                write_tries += 1
                # Retry up to 10 times. This can solve problems like leases being held by another process.
                if write_tries < 10:
                    time.sleep(retry_sleep_time * write_tries)
                    continue
                # Issue an error after the maximum number of retries.
                etype, evalue, etb = sys.exc_info()
                self.logger.error('Max write retries reached. Could not log event %s. Exception: %s, Error: %s.' % (event, etype, evalue))
    self.events_container = []
    self.is_storing = False

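# compressGzip is referenced above but not shown in this section; a minimal
# sketch of what such a helper could look like (an assumption, not this
# module's actual code):
import gzip
import io

def compressGzip(self, data):
    # Compress the buffered lines in memory before handing them to webhdfs.
    buffer = io.BytesIO()
    compressor = gzip.GzipFile(fileobj=buffer, mode='wb')
    try:
        compressor.write(data)
    finally:
        compressor.close()
    return buffer.getvalue()
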
def handleEvent(self, event):
    # Crude spinlock to keep concurrent events from interleaving their output.
    while self.printing:
        time.sleep(.0001)
    self.printing = True
    if self.format:
        output = mapDynamicValue(self.format, event)
    else:
        output = event
    if self.pretty_print and not self.format:
        pprint.pprint(output, indent=4)
    else:
        print("%s" % output)
    self.printing = False
    yield None

def storeData(self, events):
    write_data = collections.defaultdict(str)
    for event in events:
        path = mapDynamicValue(self.file_name, mapping_dict=event, use_strftime=True)
        line = mapDynamicValue(self.format, mapping_dict=event)
        write_data["%s" % path] += line + "\n"
    success = True
    for path, lines in write_data.items():
        try:
            self.ensurePathExists(path)
        except:
            etype, evalue, etb = sys.exc_info()
            self.logger.error('Could not create path %s. Events could not be written. Exception: %s, Error: %s.' % (path, etype, evalue))
            return
        mode = "a+"
        if self.compress == 'gzip':
            path += ".gz"
            mode += "b"
            lines = self.compressGzip(lines)
        elif self.compress == 'snappy':
            path += ".snappy"
            lines = self.compressSnappy(lines)
            mode += "b"
        try:
            fh = self.getOrCreateFileHandle(path, mode)
            fh.write(lines)
            fh.flush()
        except:
            etype, evalue, etb = sys.exc_info()
            self.logger.error('Could not write event data to %s. Exception: %s, Error: %s.' % (path, etype, evalue))
            success = False
    return success

def configure(self, configuration):
    # Call parent configure method.
    BaseThreadedModule.configure(self, configuration)
    # Set the log level for the elasticsearch library if configured to something other than the default.
    if self.getConfigurationValue('log_level') != 'info':
        logging.getLogger('elasticsearch').setLevel(self.logger.level)
        logging.getLogger('requests').setLevel(self.logger.level)
    else:
        logging.getLogger('elasticsearch').setLevel(logging.WARN)
        logging.getLogger('requests').setLevel(logging.WARN)
    self.query = self.getConfigurationValue('query')
    # Test if the query is valid json.
    try:
        json.loads(self.query)
    except:
        etype, evalue, etb = sys.exc_info()
        self.logger.error("Parsing json query %s failed. Exception: %s, Error: %s." % (self.query, etype, evalue))
        self.lumbermill.shutDown()
    self.search_type = self.getConfigurationValue('search_type')
    self.batch_size = self.getConfigurationValue('batch_size')
    self.field_mappings = self.getConfigurationValue('field_mappings')
    self.es_nodes = self.getConfigurationValue('nodes')
    self.read_timeout = self.getConfigurationValue("read_timeout")
    if not isinstance(self.es_nodes, list):
        self.es_nodes = [self.es_nodes]
    self.index_name_pattern = self.getConfigurationValue('index_name')
    self.index_name = mapDynamicValue(self.index_name_pattern, use_strftime=True).lower()
    if self.getConfigurationValue("connection_type") == 'urllib3':
        self.connection_class = connection.Urllib3HttpConnection
    elif self.getConfigurationValue('connection_type') == 'requests':
        self.connection_class = connection.RequestsHttpConnection
    self.lock = Lock()
    self.manager = Manager()
    if self.search_type == 'scan':
        self.can_run_forked = True
        scroll_id = self.getInitalialScrollId()
        if not scroll_id:
            self.lumbermill.shutDown()
        self.shared_scroll_id = self.manager.Value(c_char_p, scroll_id)
    elif self.search_type == 'normal':
        self.query_from = 0
        self.query = json.loads(self.query)
        self.query['size'] = self.batch_size
    self.es = self.connect()

def storeData(self, events):
    packet = []
    for event in events:
        # Default to None so Zabbix uses the receive time when no timestamp field is configured.
        timestamp = None
        if self.timestamp_field:
            try:
                timestamp = event[self.timestamp_field]
            except KeyError:
                timestamp = None
        hostname = mapDynamicValue(self.hostname, mapping_dict=event, use_strftime=True)
        for field_name in self.fields:
            try:
                packet.append(ZabbixMetric(hostname, "%s%s" % (self.field_prefix, field_name), event[field_name], timestamp))
            except KeyError:
                pass
                # self.logger.warning("Could not send metrics for %s:%s. Field not found." % (hostname, field_name))
    response = self.zabbix_sender.send(packet)
    if response.failed != 0:
        self.logger.warning("%d of %d metrics were not processed correctly." % (response.total - response.processed, response.total))

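# The self.zabbix_sender used above matches the py-zabbix package's API; a
# sketch of how it could be created (server address and port are made-up
# placeholder values, not this module's configuration):
from pyzabbix import ZabbixSender

def connect(self):
    return ZabbixSender(zabbix_server='zabbix.example.com', zabbix_port=10051)
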
def configure(self, configuration):
    # Call parent configure method.
    BaseThreadedModule.configure(self, configuration)
    # Set the log level for the elasticsearch library if configured to something other than the default.
    if self.getConfigurationValue('log_level') != 'info':
        logging.getLogger('elasticsearch').setLevel(self.logger.level)
        logging.getLogger('requests').setLevel(self.logger.level)
    else:
        logging.getLogger('elasticsearch').setLevel(logging.WARN)
        logging.getLogger('requests').setLevel(logging.WARN)
    self.query = self.getConfigurationValue('query')
    # Test if the query is valid json.
    try:
        json.loads(self.query)
    except:
        etype, evalue, etb = sys.exc_info()
        self.logger.error("Parsing json query %s failed. Exception: %s, Error: %s." % (self.query, etype, evalue))
        self.lumbermill.shutDown()
    self.search_type = self.getConfigurationValue('search_type')
    self.batch_size = self.getConfigurationValue('batch_size')
    self.field_mappings = self.getConfigurationValue('field_mappings')
    self.es_nodes = self.getConfigurationValue('nodes')
    self.read_timeout = self.getConfigurationValue("read_timeout")
    if not isinstance(self.es_nodes, list):
        self.es_nodes = [self.es_nodes]
    self.index_name_pattern = self.getConfigurationValue('index_name')
    self.index_name = mapDynamicValue(self.index_name_pattern, use_strftime=True).lower()
    if self.getConfigurationValue("connection_type") == 'urllib3':
        self.connection_class = connection.Urllib3HttpConnection
    elif self.getConfigurationValue('connection_type') == 'requests':
        self.connection_class = connection.RequestsHttpConnection
    self.lock = Lock()
    self.manager = Manager()
    if self.search_type == 'scroll':
        self.can_run_forked = True
        scroll_id = self.getInitalialScrollId()
        if not scroll_id:
            self.lumbermill.shutDown()
        self.shared_scroll_id = self.manager.Value(c_char_p, scroll_id)
    elif self.search_type == 'normal':
        self.query_from = 0
        self.query = json.loads(self.query)
        self.query['size'] = self.batch_size
    self.es = self.connect()

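# getInitalialScrollId is referenced above but not shown in this section; a
# sketch of what it could look like with elasticsearch-py, assuming an
# already connected self.es client and a made-up '1m' scroll keep-alive:
import sys

def getInitalialScrollId(self):
    try:
        result = self.es.search(index=self.index_name, body=self.query, scroll='1m', size=self.batch_size)
        return result['_scroll_id']
    except:
        etype, evalue, etb = sys.exc_info()
        self.logger.error("Could not get initial scroll id. Exception: %s, Error: %s." % (etype, evalue))
        return None
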
def handleEvent(self, event):
    # Crude spinlock to keep concurrent events from interleaving their output.
    while self.printing:
        time.sleep(.0001)
    self.printing = True
    if self.format:
        output = mapDynamicValue(self.format, event)
        print("%s" % output)
    elif self.pretty_print:
        if not self.fields:
            output = event
        else:
            output = {}
            for field in self.fields:
                try:
                    value = event[field]
                except KeyError:
                    continue
                output[field] = value
        pprint.pprint(output, indent=4)
    self.printing = False
    yield None

def testMapDynamicValueWithTimePattern(self):
    timestring = datetime.datetime.utcnow().strftime('%Y.%m.%d')
    self.assertEqual(mapDynamicValue('test-%Y.%m.%d-%(lumbermill.event_id)s', self.event, use_strftime=True),
                     'test-%s-715bd321b1016a442bf046682722c78e' % timestring)

def testMapDynamicValueWithDictType(self):
    # Make sure that mapDynamicValue will work on a copy of value when passing in a list or a dict.
    mapping_dict = {'event_id': '%(lumbermill.event_id)s'}
    mapped_values = mapDynamicValue(mapping_dict, self.event)
    self.assertEqual(mapped_values['event_id'], '715bd321b1016a442bf046682722c78e')
    self.assertEqual(mapping_dict, {'event_id': '%(lumbermill.event_id)s'})

def testMapDynamicValues(self):
    self.assertEqual(mapDynamicValue('%(bytes_send)s', self.event), "3395")
    self.assertEqual(mapDynamicValue('%(lumbermill.event_id)s', self.event), "715bd321b1016a442bf046682722c78e")
    self.assertEqual(mapDynamicValue('%(lumbermill.list.0)s', self.event), "10")
    self.assertEqual(mapDynamicValue('%(lumbermill.list.2.hovercraft)s', self.event), "eels")
    self.assertEqual(mapDynamicValue('%(params.spanish)s', self.event), "[u'inquisition']")

def handleEvent(self, event):
    if self.format:
        self.syslogger.info(mapDynamicValue(self.format, event))
    else:
        self.syslogger.info(event)
    yield None

def handleEvent(self, event):
    throttled_event_key = mapDynamicValue(self.key, event)
    throttled_event_count = self.setAndGetEventCountByKey(throttled_event_key)
    # Only pass the event on while its count lies inside the configured window.
    if self.min_count <= throttled_event_count <= self.max_count:
        yield event

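# setAndGetEventCountByKey is not shown in this section; a minimal in-memory
# sketch of its assumed contract (increment and return the counter for a
# key), leaving out the timed expiry a real throttle module would need:
import collections

def setAndGetEventCountByKey(self, key):
    if not hasattr(self, 'event_counts'):
        self.event_counts = collections.defaultdict(int)
    self.event_counts[key] += 1
    return self.event_counts[key]
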
def testMapDynamicValueWithNoneType(self):
    self.assertEqual(mapDynamicValue(None, self.event), None)

def testMapDynamicValueWithListType(self):
    # Make sure that mapDynamicValue will work on a copy of value when passing in a list or a dict.
    mapping_list = ['%(lumbermill.event_id)s']
    mapped_values = mapDynamicValue(mapping_list, self.event)
    self.assertEqual(mapped_values[0], '715bd321b1016a442bf046682722c78e')
    self.assertEqual(mapping_list, ['%(lumbermill.event_id)s'])

def testMapDynamicValueWithMissingKey(self):
    self.assertEqual(mapDynamicValue('%(missing_key)s', self.event), '%(missing_key)s')

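# The tests above pin down the contract of mapDynamicValue: %-style
# placeholders with dotted paths into nested dicts/lists, printf format
# specs, optional strftime expansion, and templates with unresolvable keys
# returned unchanged. Below is a minimal sketch of those semantics; the
# real LumberMill implementation differs, and flattenEvent is a
# hypothetical helper, not part of the library:
import datetime

def flattenEvent(event, prefix=''):
    # Build {'lumbermill.list.0': value, ...} style keys from nested containers.
    flat = {}
    items = event.items() if isinstance(event, dict) else enumerate(event)
    for key, value in items:
        path = "%s%s" % (prefix, key)
        flat[path] = value
        if isinstance(value, (dict, list)):
            flat.update(flattenEvent(value, path + '.'))
    return flat

def mapDynamicValueSketch(value, mapping_dict={}, use_strftime=False):
    if value is None:
        return None
    if use_strftime:
        value = datetime.datetime.utcnow().strftime(value)
    try:
        # %-formatting also handles width/precision specs like %(longitude)05.2f.
        return value % flattenEvent(mapping_dict)
    except KeyError:
        # Unresolvable placeholders leave the template untouched.
        return value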