class JobCommand(GeneratingCommand):
    handler = Option(require=True)
    search_name = Option(require=False)
    argv = Option(require=False)
    repeat_on_error = Option(default=True, validate=validators.Boolean())
    repeat_on_success = Option(default=True, validate=validators.Boolean())

    def generate(self):
        logging_handler = LoggingHandler()
        logger = get_logger()
        # root_logger = logging.getLogger()
        # configure_logger(root_logger)
        logger.addHandler(logging_handler)
        # root_logger.addHandler(get_handler())
        logger.debug("running ...")
        try:
            func = get_method(self.handler)
            if self.argv:
                try:
                    argv = json.loads(self.argv)
                except json.JSONDecodeError as e:
                    err_msg = traceback.format_exc()
                    logger.error("unable to decode argv: %s" % err_msg)
                    # raise Stop()
                    argv = []
            else:
                argv = []
            func(self.service, *argv)
            logger.debug("done")
            if self.repeat_on_success:
                return
        except exceptions.Repeat:
            logger.debug("will repeat")
            return
        except exceptions.Stop:
            logger.debug("will stop and not repeat")
        except Exception as e:
            err_msg = traceback.format_exc()
            # err_msg = "..............."
            logger.error("exception during job execution: %s" % err_msg)
            if self.repeat_on_error:
                logger.debug("will repeat")
                return
        finally:
            logging.shutdown()
            for e in logging_handler.events:
                yield e
        if self.search_name:
            self.service.saved_searches.delete(self.search_name)
        else:
            logger.error("missing search_name")
class snowIncidentCommand(GeneratingCommand):
    assigned = Option(require=True, validate=validators.List())
    assigned_by = Option(require=False)
    daysAgo = Option(require=False, validate=validators.Integer(0))
    active = Option(require=True, validate=validators.Boolean())
    limit = Option(require=False, validate=validators.Integer(0))
    env = Option(require=False)

    def generate(self):
        self.logger.debug('snowIncidentCommand: %s', self)
        searchinfo = self.metadata.searchinfo
        app = AppConf(searchinfo.splunkd_uri, searchinfo.session_key)
        env = self.env.lower() if self.env else 'production'
        conf = app.get_config('getsnow')[env]
        assigned_by = 'assignment_group' if self.assigned_by == 'group' else 'assigned_to'
        assignment = {'table': 'sys_user_group', 'field': 'name'} if self.assigned_by == 'group' \
            else {'table': 'sys_user', 'field': 'user_name'}
        limit = self.limit if self.limit else 10000
        snowincident = snow(conf['url'], conf['user'], conf['password'])
        sids = snowincident.getsysid(assignment['table'], assignment['field'], self.assigned)
        filters = snowincident.filterbuilder(assigned_by, sids)
        glide = 'sys_created_on>=javascript:gs.daysAgo({})'.format(self.daysAgo) if self.daysAgo else ''
        url = snowincident.reqencode(filters, table='incident', glide_system=glide,
                                     active=self.active, sysparm_limit=limit)
        for record in snowincident.getrecords(url):
            record = snowincident.updatevalue(record, sourcetype='snow:incident')
            record['_raw'] = json.dumps(record)
            record = dictexpand(record)
            yield record
def test_boolean(self):
    truth_values = {
        '1': True, '0': False,
        't': True, 'f': False,
        'true': True, 'false': False,
        'y': True, 'n': False,
        'yes': True, 'no': False
    }
    validator = validators.Boolean()
    for value in truth_values:
        for variant in value, value.capitalize(), value.upper():
            for s in unicode(variant), bytes(variant):
                self.assertEqual(validator.__call__(s), truth_values[value])
    self.assertIsNone(validator.__call__(None))
    self.assertRaises(ValueError, validator.__call__, 'anything-else')
    return
class StubbedReportingCommand(ReportingCommand):

    boolean = Option(
        doc='''
        **Syntax:** **boolean=***<value>*
        **Description:** A boolean value''',
        require=False, validate=validators.Boolean())

    duration = Option(
        doc='''
        **Syntax:** **duration=***<value>*
        **Description:** A length of time''',
        validate=validators.Duration())

    fieldname = Option(
        doc='''
        **Syntax:** **fieldname=***<value>*
        **Description:** Name of a field''',
        validate=validators.Fieldname())

    file = Option(
        doc='''
        **Syntax:** **file=***<value>*
        **Description:** Name of a file''',
        validate=validators.File(mode='r'))

    integer = Option(
        doc='''
        **Syntax:** **integer=***<value>*
        **Description:** An integer value''',
        validate=validators.Integer())

    optionname = Option(
        doc='''
        **Syntax:** **optionname=***<value>*
        **Description:** The name of an option (used internally)''',
        validate=validators.OptionName())

    regularexpression = Option(
        doc='''
        **Syntax:** **regularexpression=***<value>*
        **Description:** Regular expression pattern to match''',
        validate=validators.RegularExpression())

    set = Option(
        doc='''
        **Syntax:** **set=***<value>*
        **Description:** A value drawn from a fixed set of allowed values''',
        validate=validators.Set("foo", "bar", "test"))

    @Configuration()
    def map(self, records):
        pass

    def reduce(self, records):
        pass
class Joiner(EventingCommand):
    """ Combines transaction results into one json, assuming each of those
    individual results is a json. """

    should_list = Option(
        doc="should generate list instead of overwriting conflicting keys",
        default=False, validate=validators.Boolean())

    def transform(self, records):
        """ applies the appropriate update function to each record """
        updating_func = type(self)._update_and_overwrite
        if self.should_list:
            updating_func = self._update_and_list
        for record in records:
            unified_json = {}
            individual_json_list = record['_raw'].split('\n')
            updating_func(unified_json, individual_json_list)
            record['_raw'] = json.dumps(unified_json)
            yield record

    @staticmethod
    def _update_and_overwrite(unified_json, individual_json_list):
        for doc in individual_json_list:
            loaded = json.loads(doc)
            # update() automatically overwrites duplicate keys
            unified_json.update(loaded)

    def _update_and_list(self, unified_json, individual_json_list):
        for doc in individual_json_list:
            loaded = json.loads(doc)
            for key, value in loaded.items():
                existing_value = unified_json.get(key)
                # self.fieldnames are keys NOT to list
                if key in self.fieldnames and existing_value:
                    continue
                elif key in self.fieldnames and not existing_value:
                    unified_json[key] = value
                else:
                    type(self)._do_safe_update(unified_json, existing_value, key, value)

    @staticmethod
    def _do_safe_update(unified_json, existing_value, key, value):
        if existing_value:
            unified_json[key].append(value)
        else:
            unified_json[key] = [value]
class CloudgatewayHttpsCheck(GeneratingCommand):
    """
    This command checks spacebridge reachability by using twisted to connect to the websocket echo endpoint
    and sending a message. The test is considered a success if it gets back the message it sent within
    10 seconds.

    By default it will inherit Splunk's proxy settings and use them. In the command you can disable the proxy
    by passing useProxy=False.
    """
    useProxy = Option(require=False, validate=validators.Boolean(), default=True)

    def __init__(self):
        super(CloudgatewayHttpsCheck, self).__init__()
        self.echo_state = EchoState()

    def timeout(self):
        self.echo_state.message = 'Timeout'
        reactor.stop()

    def test_wss(self):
        ws_url = "wss://{}/echo".format(config.get_spacebridge_server())
        headers = {'Authorization': "f00d"}
        use_proxy = self.useProxy
        proxy, auth = config.get_ws_https_proxy_settings()

        if use_proxy:
            # Proxy setup
            if auth:
                headers['Proxy-Authorization'] = 'Basic ' + auth
        else:
            proxy = None

        factory = WebSocketClientFactory(ws_url, headers=headers, proxy=proxy)
        factory.protocol = CheckMobileWssProtocol
        factory.state = self.echo_state

        connectWS(factory)
        reactor.callLater(10, self.timeout)
        reactor.run()

        record = {'websocket': self.echo_state.ok, 'message': self.echo_state.message}
        return record

    def generate(self):
        record = self.test_wss()
        yield record
class CloudgatewayAsyncCheck(GeneratingCommand):
    """
    This command checks spacebridge reachability by using twisted to make an http call to the health check
    endpoint. Any http return other than 200 is considered a failure.

    By default it will inherit Splunk's proxy settings and use them. In the command you can disable the proxy
    by passing useProxy=False.
    """
    useProxy = Option(require=False, validate=validators.Boolean(), default=True)

    def __init__(self):
        super(CloudgatewayAsyncCheck, self).__init__()
        self.echo_state = EchoState()

    def run(self):
        proxy = config.get_https_proxy_settings()
        uri = "{}/health_check".format(config.get_spacebridge_domain())

        if not self.useProxy:
            proxy = None

        client = AsyncClient(treq=noverify_treq_instance(https_proxy=proxy))

        def done(result):
            if result.code == 200:
                self.echo_state.ok = True
            else:
                self.echo_state.message = 'Got http {}'.format(result.code)
            reactor.stop()

        def err(failure):
            self.echo_state.message = failure
            reactor.stop()

        d = client.async_get_request(uri, None)
        d.addCallback(done)
        d.addErrback(err)
        reactor.run()

        return {
            'https_async': self.echo_state.ok,
            'message': self.echo_state.message
        }

    def generate(self):
        yield self.run()


''' HELPERS '''
class DnsLookupCommand(StreamingCommand):

    perevent = Option(
        doc='''
        **Syntax:** **perevent=***<perevent>*
        **Description:** create uuid per event''',
        require=False, validate=validators.Boolean(), default=False)

    def stream(self, records):
        guid = uuid.uuid4()
        for record in records:
            if self.perevent:
                guid = uuid.uuid4()
            record['uuid'] = str(guid)
            yield record
class Vader(StreamingCommand):
    """ Returns sentiment score between -1 and 1, and can also return detailed sentiment values.

    ##Syntax

    .. code-block::
        vader textfield=<field>

    ##Description

    Sentiment analysis using Valence Aware Dictionary and sEntiment Reasoner.
    Using the option full_output will also return the neutral, positive, and negative scores
    that make up the compound score (which is returned as the field "sentiment").
    Best to feed in uncleaned data as it takes into account capitalization and punctuation.

    ##Example

    .. code-block::
        * | vader textfield=sentence
    """

    textfield = Option(
        require=True,
        doc='''
        **Syntax:** **textfield=***<fieldname>*
        **Description:** Name of the field that will contain the text to search against''',
        validate=validators.Fieldname())

    full_output = Option(
        default=False,
        doc='''
        **Syntax:** **full_output=***<boolean>*
        **Description:** If true, returns full sentiment values--neutral, positive, and negative--otherwise only compound is returned''',
        validate=validators.Boolean())

    def stream(self, records):
        sentiment_analyzer = SentimentIntensityAnalyzer()
        for record in records:
            polarity = sentiment_analyzer.polarity_scores(record[self.textfield])
            record['sentiment'] = polarity['compound']
            if self.full_output:
                record['sentiment_neutral'] = polarity['neu']
                record['sentiment_negative'] = polarity['neg']
                record['sentiment_positive'] = polarity['pos']
            yield record
class SecureGatewayAsyncCheck(GeneratingCommand):
    """
    This command checks spacebridge reachability by making an asynchronous http call to the health check
    endpoint. Any http return other than 200 is considered a failure.

    By default it will inherit Splunk's proxy settings and use them. In the command you can disable the proxy
    by passing useProxy=False.
    """
    useProxy = Option(require=False, validate=validators.Boolean(), default=True)

    def __init__(self):
        super(SecureGatewayAsyncCheck, self).__init__()
        self.echo_state = EchoState()

    async def run(self):
        proxy = config.get_https_proxy_settings()
        uri = "{}/health_check".format(config.get_spacebridge_domain())

        if not self.useProxy:
            proxy = None

        client = AsyncClient(AioHttpClient(proxy=proxy))
        try:
            result = await client.async_get_request(uri, None)
            if result.code == 200:
                self.echo_state.ok = True
            else:
                self.echo_state.message = 'Got http {}'.format(result.code)
        except Exception as e:
            self.echo_state.message = str(e)

        return {
            'https_async': self.echo_state.ok,
            'message': self.echo_state.message
        }

    def generate(self):
        loop = asyncio.new_event_loop()
        r = loop.run_until_complete(self.run())
        loop.close()
        yield r


''' HELPERS '''
class SecureGatewayHttpsCheck(GeneratingCommand):
    """
    This command checks spacebridge reachability by using requests to make an http call to the health check
    endpoint. Any http return other than 200 is considered a failure.

    By default it will inherit Splunk's proxy settings and use them. In the command you can disable the proxy
    by passing useProxy=False.
    """
    useProxy = Option(require=False, validate=validators.Boolean(), default=True)

    def generate(self):
        spacebridge_server = config.get_spacebridge_domain()
        url = "{}/health_check".format(spacebridge_server)
        proxies = config.get_proxies()

        # Unset the proxy if useProxy=False
        if not self.useProxy:
            proxies = {}

        # Load data from REST API
        try:
            response = requests.get(url, proxies=proxies, timeout=15)
            response.raise_for_status()
            healthy = {'https_sync': True}
        except requests.exceptions.HTTPError as err:
            healthy = {'https_sync': False, 'message': str(err)}
        except ProxyError as err:
            healthy = {'https_sync': False, 'message': str(err)}
        except requests.ConnectionError as err:
            healthy = {'https_sync': False, 'message': str(err)}

        yield healthy


''' HELPERS '''
class AnkitCommand(GeneratingCommand):
    firstname = Option(require=True, validate=validators.OptionName())
    lastname = Option(require=True, validate=validators.OptionName())
    botn = Option(require=False, validate=validators.Boolean())

    def generate(self):
        url = 'http://api.icndb.com/jokes/random?limitTo=nerdy&firstName={}&lastName={}'.format(
            self.firstname, self.lastname)
        response = requests.get(url)
        value = json.loads(response.text)['value']
        joke = value['joke']

        botn = self.botn
        botn_response_value = ""
        conc_string = joke

        if botn:
            botn_url = 'http://botn.splunk.link:8000/ep'
            response = requests.get(botn_url)
            botn_response_value = response.text
            conc_string = conc_string + "\n BOTN response: " + response.text

        yield {'_time': time.time(), '_raw': conc_string}
class CloudgatewayHttpsCheck(GeneratingCommand):
    """
    This command lets a user check whether any webpage is reachable by entering a url string.
    At this time, a 404 from either ims.prod-nlp.spl.mobi or auths.prod-nlp.spl.mobi is considered a success
    for connectivity. Any http return other than 200 or 404 is considered a failure.

    By default it will inherit Splunk's proxy settings and use them. In the command you can disable the proxy
    by passing useProxy=False.
    """
    useProxy = Option(require=False, validate=validators.Boolean(), default=True)
    url = Option(require=True)

    def generate(self):
        url = self.url
        proxies = config.get_proxies()

        # Unset the proxy if useProxy=False
        if not self.useProxy:
            proxies = {}

        # Load data from REST API
        try:
            response = requests.get(url, proxies=proxies, timeout=15)
            response.raise_for_status()
            healthy = {'connected': True}
        except requests.exceptions.HTTPError as err:
            healthy = {'connected': False, 'message': str(err)}
        except ProxyError as err:
            healthy = {'connected': False, 'message': str(err)}

        yield healthy


''' HELPERS '''
class MakeAlertsCommand(StreamingCommand):

    time = Option(
        doc='''
        **Syntax:** **time=***<field>*
        **Description:** Field name used to determine event time for the alert''',
        require=False, validate=validators.Fieldname(), default='_time')

    entity = Option(
        doc='''
        **Syntax:** **entity=***<field>*
        **Description:** Field name used to determine the entity triggering the alert (account name, machine name, ...)''',
        require=False, validate=validators.Fieldname(), default='entity')

    alert_type = Option(
        doc='''
        **Syntax:** **type=***<string>*
        **Description:** Field name used to determine the type of alert''',
        require=True, name='type')

    severity = Option(
        doc='''
        **Syntax:** **severity=***<field>*
        **Description:** Field name used to set severity of the alert''',
        require=False, validate=validators.Fieldname(), default=None)

    idfield = Option(
        doc='''
        **Syntax:** **idfield=***<field>*
        **Description:** Field name used to store the alert id''',
        require=False, default=None, validate=validators.Fieldname())

    combine = Option(
        doc='''
        **Syntax:** **combine=***"<fields>"*
        **Description:** Comma separated field names where alerts should be combined instead of creating new ones.''',
        require=False, default=None)

    combine_window = Option(
        doc='''
        **Syntax:** **combine_window=***<string>*
        **Description:** hours or days.''',
        require=False, default=None)

    interactive = Option(
        doc='''
        **Syntax:** **interactive=***<bool>*
        **Description:** If true, makealerts can run in an interactive search, otherwise it will run only in a scheduled search (this is to prevent alerts being created accidentally when copying and pasting scheduled search text)''',
        require=False, default=False, validate=validators.Boolean())

    preview = Option(
        doc='''
        **Syntax:** **preview=***<bool>*
        **Description:** If true, makealerts does not create alerts but instead indicates what it would do in the preview field''',
        require=False, default=False, validate=validators.Boolean())

    alerts = None

    def __init__(self):
        super(MakeAlertsCommand, self).__init__()
        self.insert_stats = InsertStats()
        self.loggerExtra = self.logger

    def is_scheduled(self):
        sid = self._metadata.searchinfo.sid
        return sid.startswith("scheduler_") or sid.startswith("rt_scheduler_")

    def stream(self, records):
        # self.logger.info('MakeAlertsCommand: %s, type of record %s', self, type(records))  # logs command line
        # self.logger.info('SEARCHINFO %s', self._metadata.searchinfo)
        sid = self._metadata.searchinfo.sid
        self.loggerExtra = CustomLogAdapter(self.logger, {'sid': sid, 'type': self.alert_type})

        if not self.interactive and not self.is_scheduled():
            raise RuntimeError(
                "When testing makealerts from interactive search, provide the 'interactive=t' option.")

        if not self.alerts:
            self.alerts = AlertCollection(self._metadata.searchinfo.session_key)

        for record in records:
            search_context = SearchContext(self._metadata.searchinfo, self.loggerExtra)
            self.alerts.insert(record,
                               event_time=self.time,
                               entity=self.entity,
                               alert_type=self.alert_type,
                               severity=self.severity,
                               idfield=self.idfield,
                               combine=self.combine,
                               combine_window=self.combine_window,
                               preview=self.preview,
                               search_context=search_context,
                               insert_stats=self.insert_stats)
            if self.preview:
                record['preview'] = str(search_context.messages)
            yield record

    def finish(self):
        if self.interactive and (not self.is_scheduled()) and self.insert_stats.errors > 0:
            self.write_error(
                "There were {0} error(s) when trying to insert data, check logs with this search 'index=_internal MakeAlertsCommand source=*super_simple_siem.log* ERROR'",
                self.insert_stats.errors)
        if not self.preview:
            self.loggerExtra.info('s3tag=stats', str(self.insert_stats))
        try:
            super(MakeAlertsCommand, self).finish()
        except:
            pass
class mispgetevent(ReportingCommand): """ get the attributes from a MISP instance. ##Syntax .. code-block:: | mispgetevent misp_instance=<input> last=<int>(d|h|m) | mispgetevent misp_instance=<input> event=<id1>(,<id2>,...) | mispgetevent misp_instance=<input> date=<<YYYY-MM-DD> (date_to=<YYYY-MM-DD>) ##Description { "returnFormat": "mandatory", "page": "optional", "limit": "optional", "value": "optional", "type": "optional", "category": "optional", "org": "optional", "tag": "optional", "tags": "optional", "searchall": "optional", "date": "optional", "last": "optional", "eventid": "optional", "withAttachments": "optional", "metadata": "optional", "uuid": "optional", "published": "optional", "publish_timestamp": "optional", "timestamp": "optional", "enforceWarninglist": "optional", "sgReferenceOnly": "optional", "eventinfo": "optional", "excludeLocalTags": "optional" } # status "tag": "optional", "searchall": "optional", "metadata": "optional", "published": "optional", "sgReferenceOnly": "optional", "eventinfo": "optional", "excludeLocalTags": "optional" "returnFormat": forced to json, "page": param, "limit": param, "value": not managed, "type": param, CSV string, "category": param, CSV string, "org": not managed, "tags": param, see also not_tags "date": param, "last": param, "eventid": param, "withAttachments": forced to false, "uuid": not managed, "publish_timestamp": managed via param last "timestamp": not managed, "enforceWarninglist": not managed, } """ # MANDATORY MISP instance for this search misp_instance = Option(doc=''' **Syntax:** **misp_instance=instance_name* **Description:**MISP instance parameters as described in local/inputs.conf.''', require=True) # MANDATORY: json_request XOR eventid XOR last XOR date json_request = Option(doc=''' **Syntax:** **json_request=***valid JSON request* **Description:**Valid JSON request''', require=False) eventid = Option(doc=''' **Syntax:** **eventid=***id1(,id2,...)* **Description:**list of event ID(s) or event UUID(s).''', require=False, validate=validators.Match("eventid", r"^[0-9a-f,\-]+$")) last = Option(doc=''' **Syntax:** **last=***<int>d|h|m* **Description:** publication duration in day(s), hour(s) or minute(s). **nota bene:** last is an alias of published_timestamp''', require=False, validate=validators.Match("last", r"^[0-9]+[hdm]$")) date = Option(doc=''' **Syntax:** **date=***The user set event date field - any of valid time related filters"* **Description:**starting date. **eventid**, **last** and **date** are mutually exclusive''', require=False) # Other params page = Option(doc=''' **Syntax:** **page=***<int>* **Description:**define the page for each MISP search; default 1.''', require=False, validate=validators.Match("limit", r"^[0-9]+$")) limit = Option(doc=''' **Syntax:** **limit=***<int>* **Description:**define the limit for each MISP search; default 1000. 0 = no pagination.''', require=False, validate=validators.Match("limit", r"^[0-9]+$")) type = Option(doc=''' **Syntax:** **type=***CSV string* **Description:**Comma(,)-separated string of types to search for. Wildcard is %.''', require=False) category = Option(doc=''' **Syntax:** **category=***CSV string* **Description:**Comma(,)-separated string of categories to search for. Wildcard is %.''', require=False) tags = Option(doc=''' **Syntax:** **tags=***CSV string* **Description:**Comma(,)-separated string of tags to search for. 
Wildcard is %.''', require=False) not_tags = Option(doc=''' **Syntax:** **not_tags=***CSV string* **Description:**Comma(,)-separated string of tags to exclude. Wildcard is %.''', require=False) published = Option(doc=''' **Syntax:** **published=***<1|y|Y|t|true|True|0|n|N|f|false|False>* **Description:**select only published events (for option from to) .''', require=False, validate=validators.Boolean()) getioc = Option(doc=''' **Syntax:** **getioc=***<1|y|Y|t|true|True|0|n|N|f|false|False>* **Description:**Boolean to return the list of attributes together with the event.''', require=False, validate=validators.Boolean()) pipesplit = Option(doc=''' **Syntax:** **pipesplit=***<1|y|Y|t|true|True|0|n|N|f|false|False>* **Description:**Boolean to split multivalue attributes.''', require=False, validate=validators.Boolean()) @Configuration() def map(self, records): # self.logger.debug('mispevent.map') return records def reduce(self, records): # Phase 1: Preparation my_args = prepare_config(self) my_args['misp_url'] = my_args['misp_url'] + '/events/restSearch' # check that ONE of mandatory fields is present mandatory_arg = 0 if self.json_request is not None: mandatory_arg = mandatory_arg + 1 if self.eventid: mandatory_arg = mandatory_arg + 1 if self.last: mandatory_arg = mandatory_arg + 1 if self.date: mandatory_arg = mandatory_arg + 1 if mandatory_arg == 0: logging.error('Missing "json_request", eventid", \ "last" or "date" argument') raise Exception('Missing "json_request", "eventid", \ "last" or "date" argument') elif mandatory_arg > 1: logging.error('Options "json_request", eventid", "last" \ and "date" are mutually exclusive') raise Exception('Options "json_request", "eventid", "last" \ and "date" are mutually exclusive') body_dict = dict() # Only ONE combination was provided if self.json_request is not None: body_dict = json.loads(self.json_request) logging.info('Option "json_request" set') elif self.eventid: if "," in self.eventid: event_criteria = {} event_list = self.eventid.split(",") event_criteria['OR'] = event_list body_dict['eventid'] = event_criteria else: body_dict['eventid'] = self.eventid logging.info('Option "eventid" set with %s', json.dumps(body_dict['eventid'])) elif self.last: body_dict['last'] = self.last logging.info('Option "last" set with %s', str(body_dict['last'])) else: body_dict['date'] = self.date.split() logging.info('Option "date" set with %s', json.dumps(body_dict['date'])) # Force some values on JSON request body_dict['returnFormat'] = 'json' body_dict['withAttachments'] = False # set proper headers headers = {'Content-type': 'application/json'} headers['Authorization'] = my_args['misp_key'] headers['Accept'] = 'application/json' # Search pagination pagination = True if self.limit is not None: limit = int(self.limit) elif 'limit' in body_dict: limit = int(body_dict['limit']) else: limit = 1000 if limit == 0: pagination = False if self.page is not None: page = int(self.page) elif 'page' in body_dict: page = body_dict['page'] else: page = 1 if self.published is True: body_dict['published'] = True elif self.published is False: body_dict['published'] = False if self.category is not None: if "," in self.category: cat_criteria = {} cat_list = self.category.split(",") cat_criteria['OR'] = cat_list body_dict['category'] = cat_criteria else: body_dict['category'] = self.category if self.type is not None: if "," in self.type: type_criteria = {} type_list = self.type.split(",") type_criteria['OR'] = type_list body_dict['type'] = type_criteria else: body_dict['type'] = 
self.type if self.tags is not None or self.not_tags is not None: tags_criteria = {} if self.tags is not None: tags_list = self.tags.split(",") tags_criteria['OR'] = tags_list if self.not_tags is not None: tags_list = self.not_tags.split(",") tags_criteria['NOT'] = tags_list body_dict['tags'] = tags_criteria # output filter parameters if self.getioc is True: my_args['getioc'] = True else: my_args['getioc'] = False if self.pipesplit is True: my_args['pipe'] = True else: my_args['pipe'] = False results = [] # add colums for each type in results typelist = [] if pagination is True: body_dict['page'] = page body_dict['limit'] = limit body = json.dumps(body_dict) logging.error('mispgetevent request body: %s', body) # search r = requests.post(my_args['misp_url'], headers=headers, data=body, verify=my_args['misp_verifycert'], cert=my_args['client_cert_full_path'], proxies=my_args['proxies']) # check if status is anything other than 200; # throw an exception if it is r.raise_for_status() # response is 200 by this point or we would have thrown an exception response = r.json() if 'response' in response: for r_item in response['response']: if 'Event' in r_item: for a in list(r_item.values()): v = {} v['misp_event_id'] = str(a['id']) v['misp_orgc_id'] = str(a['orgc_id']) v['misp_event_date'] = str(a['date']) v['threat_level_id'] = str(a['threat_level_id']) v['misp_event_info'] = a['info'] v['misp_event_published'] = str(a['published']) v['misp_event_uuid'] = str(a['uuid']) v['misp_attribute_count'] = str(a['attribute_count']) v['misp_analysis'] = str(a['analysis']) v['misp_timestamp'] = str(a['timestamp']) v['misp_distribution'] = str(a['distribution']) v['misp_publish_timestamp'] = \ str(a['publish_timestamp']) v['misp_sharing_group_id'] = str(a['sharing_group_id']) v['misp_extends_uuid'] = str(a['extends_uuid']) if 'Orgc' in a: v['misp_orgc_name'] = str(a['Orgc']['name']) v['misp_orgc_uuid'] = str(a['Orgc']['uuid']) tag_list = [] if 'Tag' in a: for tag in a['Tag']: try: tag_list.append(str(tag['name'])) except Exception: pass v['misp_tag'] = tag_list if my_args['getioc'] is True: v['Attribute'] = list() v['misp_attribute_count'] = 0 if 'Attribute' in a: v['misp_attribute_count'] = \ v['misp_attribute_count'] + len(a['Attribute']) if my_args['getioc'] is True: for attribute in a['Attribute']: # combined: not part of an object AND # multivalue attribute AND to be split if int(attribute['object_id']) == 0 \ and '|' in attribute['type'] \ and my_args['pipe'] is True: mv_type_list = \ attribute['type'].split('|') mv_value_list = \ str(attribute['value']).split('|') left_a = attribute.copy() left_a['type'] = mv_type_list.pop() left_a['value'] = mv_value_list.pop() v['Attribute'].append( getioc(left_a, typelist, my_args['pipe'], left_a['object_id'])) right_a = attribute.copy() right_a['type'] = mv_type_list.pop() right_a['value'] = mv_value_list.pop() v['Attribute'].append( getioc(right_a, typelist, my_args['pipe'], right_a['object_id'])) else: v['Attribute'].append( getioc(attribute, typelist, my_args['pipe'], attribute['object_id'])) if 'Object' in a: for misp_o in a['Object']: if 'Attribute' in misp_o: v['misp_attribute_count'] = \ v['misp_attribute_count'] \ + len(misp_o['Attribute']) if my_args['getioc'] is True: object_id = misp_o['id'] object_name = misp_o['name'] object_comment = misp_o['comment'] for attribute in misp_o['Attribute']: v['Attribute'].append( getioc(attribute, typelist, my_args['pipe'], object_id, object_name, object_comment)) logging.debug('event is %s', json.dumps(v)) 
results.append(v) logging.info('typelist is %s', json.dumps(typelist)) # relevant_cat = ['Artifacts dropped', 'Financial fraud', # 'Network activity','Payload delivery','Payload installation'] logging.debug('results is %s', json.dumps(results)) if my_args['getioc'] is False: for e in results: yield e else: output_dict = {} for e in results: if 'Attribute' in e: for r in e['Attribute']: if int(r['misp_object_id']) == 0: # not an object key = str(e['misp_event_id']) + '_' \ + r['misp_attribute_id'] is_object_member = False else: # this is a MISP object key = str(e['misp_event_id']) + \ '_object_' + str(r['misp_object_id']) is_object_member = True if key not in output_dict: v = init_misp_output(e, r) for t in typelist: misp_t = 'misp_' \ + t.replace('-', '_').replace('|', '_p_') if t == r['misp_type']: v[misp_t] = r['misp_value'] else: v[misp_t] = '' to_ids = [] to_ids.append(r['misp_to_ids']) v['misp_to_ids'] = to_ids category = [] category.append(r['misp_category']) v['misp_category'] = category attribute_uuid = [] attribute_uuid.append(r['misp_attribute_uuid']) v['misp_attribute_uuid'] = attribute_uuid if is_object_member is True: v['misp_type'] = v['misp_object_name'] v['misp_value'] = v['misp_object_id'] output_dict[key] = dict(v) else: v = dict(output_dict[key]) misp_t = 'misp_' + r['misp_type'].replace('-', '_') v[misp_t] = r['misp_value'] # set value for type to_ids = v['misp_to_ids'] if r['misp_to_ids'] not in to_ids: to_ids.append(r['misp_to_ids']) v['misp_to_ids'] = to_ids category = v['misp_category'] # append if r['misp_category'] not in category: category.append(r['misp_category']) v['misp_category'] = category attribute_uuid = v['misp_attribute_uuid'] if r['misp_attribute_uuid'] not in attribute_uuid: attribute_uuid.append(r['misp_attribute_uuid']) v['misp_attribute_uuid'] = attribute_uuid if is_object_member is False: misp_type = r['misp_type'] \ + '|' + v['misp_type'] v['misp_type'] = misp_type misp_value = r['misp_value'] + \ '|' + v['misp_value'] v['misp_value'] = misp_value output_dict[key] = dict(v) for k, v in list(output_dict.items()): yield v
class CleanText(StreamingCommand): """ Counts the number of non-overlapping matches to a regular expression in a set of fields. ##Syntax .. code-block:: cleantext textfield=<field> [default_clean=<bool>] [remove_urls=<bool>] [remove_stopwords=<bool>] [base_word=<bool>] [base_type=<string>] [mv=<bool>] [force_nltk_tokenize=<bool>] [pos_tagset=<string>] [custom_stopwords=<comma_separated_string_list>] [term_min_len=<int>] [ngram_range=<int>-<int>] [ngram_mix=<bool>] ##Description Tokenize and normalize text (remove punctuation, digits, change to base_word) Different options result in better and slower cleaning. base_type="lemma_pos" being the slowest option, base_type="lemma" assumes every word is a noun, which is faster but still results in decent lemmatization. Many fields have a default already set, textfield is only required field. By default results in a multi-valued field which is ready for used with stats count by. ##Example .. code-block:: * | cleantext textfield=sentence """ textfield = Option(require=True, doc=''' **Syntax:** **textfield=***<fieldname>* **Description:** Name of the field that will contain the text to search against''', validate=validators.Fieldname()) keep_orig = Option(default=False, doc='''**Syntax:** **keep_orig=***<boolean>* **Description:** Maintain a copy of the original text for comparison or searching into field called orig_text''', validate=validators.Boolean()) default_clean = Option(default=True, doc='''**Syntax:** **default_clean=***<boolean>* **Description:** Change text to lowercase, remove punctuation, and removed numbers, defaults to true''', validate=validators.Boolean()) remove_urls = Option(default=True, doc='''**Syntax:** **remove_urls=***<boolean>* **Description:** Remove html links as part of text cleaning, defaults to true''', validate=validators.Boolean()) remove_stopwords = Option( default=True, doc='''**Syntax:** **remove_stopwords=***<boolean>* **Description:** Remove stopwords as part of text cleaning, defaults to true''', validate=validators.Boolean()) base_word = Option(default=True, doc='''**Syntax:** **base_word=***<boolean>* **Description:** Convert words to a base form as part of text cleaning, defaults to true and subject to value of base_type setting''', validate=validators.Boolean()) base_type = Option( default='lemma', doc='''**Syntax:** **base_type=***<string>* **Description:** Options are lemma, lemma_pos, or stem, defaults to lemma and subject to value of base_word setting being true''', ) mv = Option(default=True, doc='''**Syntax:** **mv=***<boolean>* **Description:** Returns words as multivalue otherwise words are space separated, defaults to true''', validate=validators.Boolean()) force_nltk_tokenize = Option( default=False, doc='''**Syntax:** **force_nltk_tokenize=***<boolean>* **Description:** Forces use of better NLTK word tokenizer but is slower, defaults to false''', validate=validators.Boolean()) pos_tagset = Option( default=None, doc='''**Syntax:** **pos_tagset=***<string>* **Description:** Options are universal, wsj, or brown; defaults to universal and subject to base_type set to "lemma_pos"''', ) custom_stopwords = Option( doc='''**Syntax:** **custom_stopwords=***<string>* **Description:** comma-separated list of custom stopwords, enclose in quotes''', ) term_min_len = Option(default=0, doc='''**Syntax:** **term_min_len=***<int>* **Description:** Only terms greater than or equal to this number will be returned. 
Useful if data has a lot of HTML markup.''', validate=validators.Integer()) ngram_range = Option( default='1-1', doc='''**Syntax:** **ngram_rane=***<int>-<int>* **Description:** Returns new ngram column with range of ngrams specified if max is greater than 1"''', ) ngram_mix = Option(default=False, doc='''**Syntax:** **ngram_mix=***<boolean>* **Description:** Determines if ngram output is combined or separate columns. Defaults to false which results in separate columns''', validate=validators.Boolean()) #http://dev.splunk.com/view/logging/SP-CAAAFCN def setup_logging(self): logger = logging.getLogger('splunk.foo') SPLUNK_HOME = os.environ['SPLUNK_HOME'] LOGGING_DEFAULT_CONFIG_FILE = os.path.join(SPLUNK_HOME, 'etc', 'log.cfg') LOGGING_LOCAL_CONFIG_FILE = os.path.join(SPLUNK_HOME, 'etc', 'log-local.cfg') LOGGING_STANZA_NAME = 'python' LOGGING_FILE_NAME = "nlp-text-analytics.log" BASE_LOG_PATH = os.path.join('var', 'log', 'splunk') LOGGING_FORMAT = "%(asctime)s %(levelname)-s\t%(module)s:%(lineno)d - %(message)s" splunk_log_handler = logging.handlers.RotatingFileHandler(os.path.join( SPLUNK_HOME, BASE_LOG_PATH, LOGGING_FILE_NAME), mode='a') splunk_log_handler.setFormatter(logging.Formatter(LOGGING_FORMAT)) logger.addHandler(splunk_log_handler) setupSplunkLogger(logger, LOGGING_DEFAULT_CONFIG_FILE, LOGGING_LOCAL_CONFIG_FILE, LOGGING_STANZA_NAME) return logger #https://stackoverflow.com/a/15590384 def get_wordnet_pos(self, treebank_tag): if treebank_tag.startswith('J'): return wordnet.ADJ elif treebank_tag.startswith('V'): return wordnet.VERB elif treebank_tag.startswith('N'): return wordnet.NOUN elif treebank_tag.startswith('R'): return wordnet.ADV else: return 'n' def f_remove_urls(self, text): return re.sub('https?://[^\b\s<]+', '', text) def ngram(self, text, min_n, max_n): ngram_list = [] for n in range(min_n, max_n): for ngram in ngrams(text, n): if len(ngram) > 1: ngram_list.append((len(ngram), ' '.join(ngram))) return ngram_list def stream(self, records): logger = self.setup_logging() logger.info('textfield set to: ' + self.textfield) if self.custom_stopwords: custom_stopwords = self.custom_stopwords.replace(' ', '').split(',') for record in records: if self.keep_orig: record['orig_text'] = record[self.textfield] #URL removal if self.remove_urls: record[self.textfield] = self.f_remove_urls( record[self.textfield]) #Tokenization if (self.base_word and self.base_type == 'lemma_pos') or self.force_nltk_tokenize: #lemma_pos - if option is lemmatization with POS tagging do cleaning and stopword options now if (self.base_word and self.base_type == 'lemma_pos'): record['pos_tuple'] = pos_tag(word_tokenize( record[self.textfield].decode('utf-8').encode( 'ascii', 'ignore')), tagset=self.pos_tagset) if self.default_clean and self.remove_stopwords: if self.custom_stopwords: stopwords = set( stop_words.words('english') + custom_stopwords) else: stopwords = set(stop_words.words('english')) record['pos_tuple'] = [ [re.sub(r'[\W\d]', '', text[0]).lower(), text[1]] for text in record['pos_tuple'] if re.sub(r'[\W\d]', '', text[0]).lower() not in stopwords and not re.search(r'[\W]', text[0]) ] elif self.default_clean and not self.remove_stopwords: record['pos_tuple'] = [ [re.sub(r'[\W\d]', '', text[0]).lower(), text[1]] for text in record['pos_tuple'] if not re.search(r'[\W]', text[0]) ] elif self.force_nltk_tokenize: record[self.textfield] = word_tokenize( record[self.textfield]) elif self.default_clean or (self.base_word and self.base_type == 'lemma'): #https://stackoverflow.com/a/1059601 
record[self.textfield] = re.split('\W+', record[self.textfield]) else: record[self.textfield] = record[self.textfield].split() #Default Clean if self.default_clean and not self.base_type == 'lemma_pos': record[self.textfield] = [ re.sub(r'[\W\d]', '', text).lower() for text in record[self.textfield] ] #Lemmatization with POS tagging if self.base_word and self.base_type == 'lemma_pos': lm = WordNetLemmatizer() tuple_list = [] tag_list = [] record[self.textfield] = [] record['pos_tag'] = [] for text in record['pos_tuple']: keep_text = lm.lemmatize(text[0], self.get_wordnet_pos( text[1])).encode( 'ascii', 'ignore') if keep_text: record[self.textfield].append(keep_text) tuple_list.append([keep_text, text[1]]) tag_list.append(text[1]) record['pos_tag'] = tag_list record['pos_tuple'] = tuple_list #Lemmatization or Stemming with stopword removal if self.remove_stopwords and self.base_word and self.base_type != 'lemma_pos': if self.custom_stopwords: stopwords = set( stop_words.words('english') + custom_stopwords) else: stopwords = set(stop_words.words('english')) if self.base_type == 'lemma': lm = WordNetLemmatizer() record[self.textfield] = [ lm.lemmatize(text) for text in record[self.textfield] if text not in stopwords ] if self.base_type == 'stem': ps = PorterStemmer() record[self.textfield] = [ ps.stem(text) for text in record[self.textfield] if text not in stopwords ] #Lemmatization or Stemming without stopword removal if not self.remove_stopwords and self.base_word: if self.base_type == 'lemma': lm = WordNetLemmatizer() record[self.textfield] = [ lm.lemmatize(text) for text in record[self.textfield] ] if self.base_type == 'stem': ps = PorterStemmer() record[self.textfield] = [ ps.stem(text) for text in record[self.textfield] ] #Stopword Removal if self.remove_stopwords and not self.base_word: if self.custom_stopwords: stopwords = set( stop_words.words('english') + custom_stopwords) else: stopwords = set(stop_words.words('english')) record[self.textfield] = [ text for text in record[self.textfield] if text not in stopwords ] #Minimum term length if self.term_min_len > 0: record[self.textfield] = [ i for i in record[self.textfield] if len(i) >= self.term_min_len ] #ngram column creation (min_n, max_n) = self.ngram_range.split('-') if max_n > 1 and max_n >= min_n: max_n = int(max_n) + 1 ngram_extract = self.ngram( filter(None, record[self.textfield]), int(min_n), max_n) if ngram_extract: for i in ngram_extract: if not self.ngram_mix: if 'ngrams_' + str(i[0]) not in record: record['ngrams_' + str(i[0])] = [] record['ngrams_' + str(i[0])].append(i[1]) else: if 'ngrams' not in record: record['ngrams'] = [] record['ngrams'].append(i[1]) else: if not self.ngram_mix: for n in range(int(min_n), int(max_n)): if n != 1: record['ngrams_' + str(n)] = [] else: if 'ngrams' not in record: record['ngrams'] = [] #Final Multi-Value Output if not self.mv: record[self.textfield] = ' '.join(record[self.textfield]) try: record['pos_tag'] = ' '.join(record['pos_tag']) except: pass yield record
class JsonToFieldsCommand(StreamingCommand):

    json = Option(
        doc='''
        **Syntax:** **json=***<field>*
        **Description:** Field name that contains the json string''',
        require=True, validate=validators.Fieldname())

    prefix = Option(
        doc='''
        **Syntax:** **prefix=***<string>*
        **Description:** Prefix to use to expand fields''',
        require=False)

    typeprefix = Option(
        doc='''
        **Syntax:** **typeprefix=***<bool>*
        **Description:** If true, prefix fields with a letter indicating the type (long, int, float, string, json, array)''',
        require=False, default=False, validate=validators.Boolean())

    def stream(self, records):
        self.logger.info('JsonToFieldsCommand: %s', self)  # logs command line
        for record in records:
            json_str = record.get(self.json)
            if json_str:
                json_obj = json.loads(json_str)
                if self.prefix:
                    prefix = self.prefix
                else:
                    prefix = ""
                for key, value in json_obj.iteritems():
                    if (not self.fieldnames) or (key in self.fieldnames):
                        if isinstance(value, basestring):
                            tp = "s_" if self.typeprefix else ""
                            record[tp + prefix + key] = value
                        elif isinstance(value, collections.Mapping):
                            tp = "j_" if self.typeprefix else ""
                            record[tp + prefix + key] = json.dumps(value)
                        elif isinstance(value, collections.Sequence):
                            tp = "a_" if self.typeprefix else ""
                            record[tp + prefix + key] = [json.dumps(s) for s in value]
                        else:
                            if self.typeprefix:
                                if isinstance(value, int):
                                    tp = "i_"
                                elif isinstance(value, float):
                                    tp = "f_"
                                elif isinstance(value, long):
                                    tp = "l_"
                                else:
                                    tp = "x_"
                            else:
                                tp = ""
                            record[tp + prefix + key] = value
            else:
                self.logger.warn('JsonToFieldsCommand: no field named %s', self.json)
            yield record
class EMGroupEntityMatchCommand(StreamingCommand):
    """
    Match groups and entities based on group filter and entity dimensions

    ##Syntax

    .. code-block::
        emgroupentitymatch selectedGroupIds="states,aws_instances" retainInput=false

    ##Description

    This custom search command will add 'group_id' and 'group_title' to all input entity records if they are
    members of a group - otherwise the record will be omitted from the results unless retainInput is 'true'.
    Options:
    1. selectedGroupIds -- indicates the selected groups that you want to match against the entities
    2. retainInput -- indicates if the original input records should be attached to the output records;
       if true, those records will have 'group_id' and 'group_title' set to 'N/A' for you to distinguish them.

    ##Example

    .. code-block::
        | inputlookup em_entities
        | emgroupentitymatch selectedGroupIds="states,aws_instances" retainInput=false
        | stats count by group_title
    """
    _group_records = None

    selected_group_ids = Option(doc='List of selected group ids, separated by comma.',
                                name='selectedGroupIds', default=None, require=False,
                                validate=validators.List())
    retain_input_record = Option(doc='Boolean to indicate if user wants the input '
                                     'record to be added to the output without modification.',
                                 name='retainInput', default=False, require=False,
                                 validate=validators.Boolean())

    def stream(self, records):
        """
        Generator function that processes and yields event records to the Splunk stream pipeline.
        :param records: splunk event records
        :return:
        """
        self._setup_group_records()
        self.logger.debug('EMGroupEntityMatchCommand: %s', self)  # logs command line
        for record in records:
            if self.retain_input_record:
                record['group_id'] = 'N/A'
                record['group_title'] = 'N/A'
                yield record
            if len(self._group_records) > 0:
                for group_record in self._group_records:
                    if self._match_group_entity(record, group_record.group_filter):
                        record['group_id'] = group_record.group_id
                        record['group_title'] = group_record.group_content.get('title')
                        yield record
            else:
                yield record

    def _setup_group_records(self):
        """
        Grabs the groups from KV Store and builds out the filter objects if they have yet to be built
        :return: None
        """
        if self._group_records is None:
            collection = self.service.kvstore[STORE_GROUPS]
            group_data = collection.data.query()
            if self.selected_group_ids:
                selected_group_set = set(self.selected_group_ids)
                group_data = filter(lambda g: g['_key'] in selected_group_set, group_data)
            group_records = []
            for group in group_data:
                filter_val = group.get('filter')
                d = {}
                if filter_val:
                    for v in filter_val.split(','):
                        dim_name, dim_val = v.strip().split('=')
                        d.setdefault(dim_name, set()).add(dim_val)
                group_records.append(GroupRecord(group_id=group['_key'], group_filter=d,
                                                 group_content=group))
            self._group_records = group_records

    def _match_group_entity(self, record, filter_dict):
        """
        Verify whether this record matches the group filters.
        Supports a wildcard at the end of a string.
        :param record:
        :param filter_dict:
        :return:
        """
        for dim_name, dim_val_set in filter_dict.iteritems():
            record_vals = record.get('dimensions.%s' % dim_name)
            if not record_vals:
                return False
            if not isinstance(record_vals, list):
                record_vals = [record_vals]
            matched = False
            for rval in record_vals:
                # check if record value is one of the filter values
                if rval in dim_val_set:
                    matched = True
                    break
                # otherwise check if record value matches any of the fuzzy match values
                fuzzy_matches = filter(lambda v: v.endswith('*'), dim_val_set)
                if len(fuzzy_matches):
                    matched = any(rval.startswith(v[:-1]) for v in fuzzy_matches)
            if not matched:
                return False
        return True
class MispSearchCommand(StreamingCommand):
    """ search in MISP for attributes matching the value of field.

    ##Syntax

    code-block::
        mispsearch field=<field> onlyids=y|n

    ##Description

    body = {
        "returnFormat": "mandatory",
        "page": "optional",
        "limit": "optional",
        "value": "optional",
        "type": "optional",
        "category": "optional",
        "org": "optional",
        "tags": "optional",
        "from": "optional",
        "to": "optional",
        "last": "optional",
        "eventid": "optional",
        "withAttachments": "optional",
        "uuid": "optional",
        "publish_timestamp": "optional",
        "timestamp": "optional",
        "enforceWarninglist": "optional",
        "to_ids": "optional",
        "deleted": "optional",
        "includeEventUuid": "optional",
        "includeEventTags": "optional",
        "event_timestamp": "optional",
        "threat_level_id": "optional",
        "eventinfo": "optional"
    }

    ##Example

    Search in MISP for value of fieldname r_ip (remote IP in proxy logs).

    code-block::
        * | mispsearch field=r_ip
    """

    misp_instance = Option(doc='''
        **Syntax:** **misp_instance=instance_name*
        **Description:** MISP instance parameters as described in local/inputs.conf''',
        require=True)
    field = Option(doc='''
        **Syntax:** **field=***<fieldname>*
        **Description:** Name of the field containing the value to search for.''',
        require=True, validate=validators.Fieldname())
    onlyids = Option(doc='''
        **Syntax:** **onlyids=***<y|n>*
        **Description:** Boolean to search only attributes with to_ids set''',
        require=False, validate=validators.Boolean())
    gettag = Option(doc='''
        **Syntax:** **gettag=***<y|n>*
        **Description:** Boolean to return attribute tags''',
        require=False, validate=validators.Boolean())
    includeEventUuid = Option(doc='''
        **Syntax:** **includeEventUuid=***y|Y|1|true|True|n|N|0|false|False*
        **Description:** Boolean to include event UUID(s) in results.''',
        require=False, validate=validators.Boolean())
    includeEventTags = Option(doc='''
        **Syntax:** **includeEventTags=***y|Y|1|true|True|n|N|0|false|False*
        **Description:** Boolean to include event tags in results.''',
        require=False, validate=validators.Boolean())
    last = Option(doc='''
        **Syntax:** **last=***<int>d|h|m*
        **Description:** publication duration in day(s), hour(s) or minute(s).
        **eventid**, **last** and **date_from** are mutually exclusive''',
        require=False, validate=validators.Match("last", r"^[0-9]+[hdm]$"))
    limit = Option(doc='''
        **Syntax:** **limit=***<int>*
        **Description:** define the limit for each MISP search; default 1000. 0 = no pagination.''',
        require=False, validate=validators.Match("limit", r"^[0-9]+$"))
    page = Option(doc='''
        **Syntax:** **page=***<int>*
        **Description:** define the page for each MISP search; default 1.''',
        require=False, validate=validators.Match("limit", r"^[0-9]+$"))
    json_request = Option(doc='''
        **Syntax:** **json_request=***valid JSON request*
        **Description:** Valid JSON request''',
        require=False)

    def stream(self, records):
        # Generate args
        my_args = prepare_config(self)
        my_args['misp_url'] = my_args['misp_url'] + '/attributes/restSearch'
        # set proper headers
        headers = {'Content-type': 'application/json'}
        headers['Authorization'] = my_args['misp_key']
        headers['Accept'] = 'application/json'

        fieldname = str(self.field)
        if self.gettag is True:
            get_tag = True
        else:
            get_tag = False

        pagination = True
        if self.limit is not None:
            if int(self.limit) == 0:
                pagination = False
            else:
                limit = int(self.limit)
        else:
            limit = 1000
        if self.page is not None:
            page = int(self.page)
        else:
            page = 1

        if self.json_request is not None:
            body_dict = json.loads(self.json_request)
            logging.info('Option "json_request" set')
            body_dict['returnFormat'] = 'json'
            body_dict['withAttachments'] = False
            if 'limit' in body_dict:
                limit = int(body_dict['limit'])
                if limit == 0:
                    pagination = False
            if 'page' in body_dict:
                page = body_dict['page']
                pagination = False
        else:
            # build search JSON object
            body_dict = {"returnFormat": "json", "withAttachments": False}
            if self.onlyids is True:
                body_dict['to_ids'] = "True"
            if self.includeEventUuid is not None:
                body_dict['includeEventUuid'] = self.includeEventUuid
            if self.includeEventTags is not None:
                body_dict['includeEventTags'] = self.includeEventTags
            if self.last is not None:
                body_dict['last'] = self.last

        for record in records:
            if fieldname in record:
                value = record.get(fieldname, None)
                if value is not None:
                    body_dict['value'] = str(value)
                    misp_category = []
                    misp_event_id = []
                    misp_event_uuid = []
                    misp_orgc_id = []
                    misp_to_ids = []
                    misp_tag = []
                    misp_type = []
                    misp_value = []
                    misp_uuid = []
                    # search
                    if pagination is True:
                        body_dict['page'] = page
                        body_dict['limit'] = limit
                    body = json.dumps(body_dict)
                    logging.debug('mispsearch request body: %s', body)
                    r = requests.post(my_args['misp_url'], headers=headers, data=body,
                                      verify=my_args['misp_verifycert'],
                                      cert=my_args['client_cert_full_path'],
                                      proxies=my_args['proxies'])
                    # check if status is anything other than 200; throw an exception if it is
                    r.raise_for_status()
                    # response is 200 by this point or we would have thrown an exception
                    # print >> sys.stderr, "DEBUG MISP REST API response: %s" % response.json()
                    response = r.json()
                    if 'response' in response:
                        if 'Attribute' in response['response']:
                            for a in response['response']['Attribute']:
                                if str(a['type']) not in misp_type:
                                    misp_type.append(str(a['type']))
                                if str(a['value']) not in misp_value:
                                    misp_value.append(str(a['value']))
                                if str(a['to_ids']) not in misp_to_ids:
                                    misp_to_ids.append(str(a['to_ids']))
                                if str(a['category']) not in misp_category:
                                    misp_category.append(str(a['category']))
                                if str(a['uuid']) not in misp_uuid:
                                    misp_uuid.append(str(a['uuid']))
                                if str(a['event_id']) not in misp_event_id:
                                    misp_event_id.append(str(a['event_id']))
                                if 'Tag' in a:
                                    for tag in a['Tag']:
                                        if str(tag['name']) not in misp_tag:
                                            misp_tag.append(str(tag['name']))
                                if 'Event' in a:
                                    if a['Event']['uuid'] not in misp_event_uuid:
                                        misp_event_uuid.append(str(a['Event']['uuid']))
                                    if a['Event']['orgc_id'] not in misp_orgc_id:
                                        misp_orgc_id.append(str(a['Event']['orgc_id']))
                    record['misp_type'] = misp_type
                    record['misp_value'] = misp_value
                    record['misp_to_ids'] = misp_to_ids
                    record['misp_category'] = misp_category
                    record['misp_attribute_uuid'] = misp_uuid
                    record['misp_event_id'] = misp_event_id
                    record['misp_event_uuid'] = misp_event_uuid
                    record['misp_orgc_id'] = misp_orgc_id
                    record['misp_tag'] = misp_tag
            yield record
class B64Command(StreamingCommand):
    """
    Encode a string to Base64
    Decode Base64 content

    | base64 [action=(encode|decode)] field=<field> [mode=(replace|append)]
    """
    field = Option(name='field', require=True, default=None)
    action = Option(name='action', require=False, default='decode', validate=Base64Actions())
    mode = Option(name='mode', require=False, default='replace', validate=OutputModes())
    alphabet = Option(name='alphabet', require=False, default=BASE64_CHARS, validate=Base64Alphabet())
    backslash_escape = Option(name='backslash_escape', require=False, default=True,
                              validate=validators.Boolean())
    encoding = Option(name='encoding', require=False, default=None, validate=OutputEncoding())
    recurse = Option(name='recurse', require=False, default=False, validate=validators.Boolean())
    suppress_error = Option(name='suppress_error', require=False, default=False,
                            validate=validators.Boolean())

    def stream(self, records):
        # Set the output field
        if self.mode == 'append':
            dest_field = 'base64'
        else:
            dest_field = self.field

        for record in records:
            # Return unchanged record if the field is not present
            if self.field not in record:
                yield record
                continue

            # Process field
            field_data_list = record[self.field]
            output_data_list = []

            # Ensure all values are in a list
            if not isinstance(field_data_list, list):
                field_data_list = [field_data_list]

            for field_data in field_data_list:
                try:
                    # Base64 Encoding
                    if self.action == 'encode':
                        # Expected input is UTF-8 read as Unicode.
                        # To pass other formats, it must be unescaped from backslash_escape
                        if self.backslash_escape:
                            field_data = field_data.encode('utf-8', errors='ignore').decode('unicode_escape')
                        field_data = field_data.encode(self.encoding, errors='ignore')

                        # Add encoded ASCII data to output
                        output_data_list.append(ensure_str(
                            to_b64(field_data, custom_alphabet=self.alphabet)
                        ))

                    # Base64 Decoding
                    else:
                        output_data = from_b64(field_data, custom_alphabet=self.alphabet, recurse=self.recurse)

                        # Try specified encoding
                        if self.encoding:
                            try:
                                decode_attempt = output_data.decode(self.encoding, errors='strict')
                                if '\x00' not in decode_attempt:
                                    output_data_list.append(decode_attempt)
                                    continue
                            except UnicodeDecodeError:
                                pass

                        # Backslash escape output
                        # Null values will break the data passed back through stdout
                        if self.backslash_escape or b'\x00' in output_data:
                            output_data_list.append(
                                backslash_escape(output_data)
                            )
                        # If encoding was not set, backslash_escape was not set, and no null found
                        else:
                            output_data_list.append(
                                output_data.decode('utf8', errors='replace')
                            )
                except Exception as e:
                    if not self.suppress_error:
                        raise e

            record[dest_field] = output_data_list
            yield record
class mispapireport(ReportingCommand): """ MISP API wrapper for endpoint /attributes/restSearch. return format is JSON for the momemnt ##Syntax use paramater names to set values in the POST request body below. .. code-block:: | mispapireport misp_instance=<input> page=<int> limit=<int> value=string type=CSVstring category=CSVstring org=string tags=CSVstring not_tags=CSVstrings date_from=date_string date_to=date_string last=<int>(d|h|m) eventid=CSVint uuid=CSVuuid_string enforceWarninglist=True|False to_ids=True|False deleted=True|False includeEventUuid=True|False includeEventTags==True|False threat_level_id=<int> eventinfo=string forced parameters: "returnFormat": "json" withAttachments: False not handled parameters: "publish_timestamp": "optional", "timestamp": "optional", "event_timestamp": "optional", ##Description { "returnFormat": "mandatory", "page": "optional", "limit": "optional", "value": "optional", "type": "optional", "category": "optional", "org": "optional", "tags": "optional", "from": "optional", "to": "optional", "last": "optional", "eventid": "optional", "withAttachments": "optional", "uuid": "optional", "publish_timestamp": "optional", "timestamp": "optional", "enforceWarninglist": "optional", "to_ids": "optional", "deleted": "optional", "includeEventUuid": "optional", "includeEventTags": "optional", "event_timestamp": "optional", "threat_level_id": "optional", "eventinfo": "optional", "includeProposals": "optional" } # status for mode=p "returnFormat": forced to json, "page": param, "limit": param, "value": param, "type": param, CSV string, "category": param, CSV string, "org": param, CSV string, "tags": param with not_tags, "from": param, "to": param, "last": param, "eventid": param, "withAttachments": forced to false, "uuid": param, "publish_timestamp": not managed, "timestamp": not managed, "enforceWarninglist": param, "to_ids": param, "deleted": param, "includeEventUuid": param, "includeEventTags": param, "event_timestamp": not managed, "threat_level_id": param, "eventinfo": param, "includeProposals": not managed } """ # Superseede MISP instance for this search misp_instance = Option(doc=''' **Syntax:** **misp_instance=instance_name* **Description:**MISP instance parameters as described in local/inputs.conf.''', require=True) # mode: p - give parameters one by one / j provide a complete JSON string # default is mode=p mode = Option(doc=''' **Syntax:** **mode=***p|j<AUTH_KEY>* **Description:**mode to build the JSON request.''', require=False, validate=validators.Match("mode", r"^(p|j)$")) # if mode=j a complete JSON request has to be provided json_request = Option(doc=''' **Syntax:** **json_request=***valid JSON request* **Description:**Valid JSON request''', require=False) # specific formats last = Option(doc=''' **Syntax:** **last=***<int>d|h|m* **Description:**publication duration in day(s), hour(s) or minute(s).''', require=False, validate=validators.Match("last", r"^[0-9]+[hdm]$")) date_from = Option(doc=''' **Syntax:** **date_from=***date_string"* **Description:**starting date.''', require=False) date_to = Option(doc=''' **Syntax:** **date_to=***date_string"* **Description:**(optional)ending date in searches with date_from. 
if not set default is now''', require=False) threat_level_id = Option(doc=''' **Syntax:** **threat_level_id=***1-4* **Description:**Threat level.''', require=False, validate=validators.Match("threat_level_id", r"^[1-4]$")) org = Option(doc=''' **Syntax:** **org=***CSV string* **Description:**Comma(,)-separated string of org name(s), id(s), uuid(s).''', require=False) # CSV numeric list eventid = Option(doc=''' **Syntax:** **eventid=***id1(,id2,...)* **Description:**list of event ID(s).''', require=False, validate=validators.Match("eventid", r"^[0-9,]+$")) # strings value = Option(doc=''' **Syntax:** **value=***string* **Description:**value.''', require=False) eventinfo = Option(doc=''' **Syntax:** **eventinfo=***string* **Description:**eventinfo string''', require=False) # numeric values limit = Option(doc=''' **Syntax:** **limit=***<int>* **Description:**define the limit for each MISP search; default 10000. 0 = no pagination.''', require=False, validate=validators.Match("limit", r"^[0-9]+$")) page = Option(doc=''' **Syntax:** **page=***<int>* **Description:**define the page of results to get.''', require=False, validate=validators.Match("page", r"^[0-9]+$")) # CSV strings uuid = Option(doc=''' **Syntax:** **uuid=***id1(,id2,...)* **Description:**list of event UUID(s).''', require=False) type = Option(doc=''' **Syntax:** **type=***CSV string* **Description:**Comma(,)-separated string of types to search for. Wildcard is %.''', require=False) category = Option(doc=''' **Syntax:** **category=***CSV string* **Description:**Comma(,)-separated string of categories to search for. Wildcard is %.''', require=False) tags = Option(doc=''' **Syntax:** **tags=***CSV string* **Description:**Comma(,)-separated string of tags to search for. Wildcard is %.''', require=False) not_tags = Option(doc=''' **Syntax:** **not_tags=***CSV string* **Description:**Comma(,)-separated string of tags to exclude from results. 
Wildcard is %.''', require=False) # Booleans to_ids = Option(doc=''' **Syntax:** **to_ids=***y|Y|1|true|True|n|N|0|false|False* **Description:**Boolean to search only attributes with the flag "to_ids" set to true.''', require=False, validate=validators.Boolean()) enforceWarninglist = Option(doc=''' **Syntax:** **enforceWarninglist=***y|Y|1|true|True|n|N|0|false|False* **Description:**Boolean to apply warning lists to results.''', require=False, validate=validators.Boolean()) deleted = Option(doc=''' **Syntax:** **deleted=***y|Y|1|true|True|n|N|0|false|False* **Description:**Boolean to include deleted attributes to results.''', require=False, validate=validators.Boolean()) includeEventUuid = Option(doc=''' **Syntax:** **includeEventUuid=***y|Y|1|true|True|n|N|0|false|False* **Description:**Boolean to include event UUID(s) to results.''', require=False, validate=validators.Boolean()) includeEventTags = Option(doc=''' **Syntax:** **includeEventTags=***y|Y|1|true|True|n|N|0|false|False* **Description:**Boolean to include event UUID(s) to results.''', require=False, validate=validators.Boolean()) pipesplit = Option(doc=''' **Syntax:** **pipesplit=***<1|y|Y|t|true|True|0|n|N|f|false|False>* **Description:**Boolean to split multivalue attributes into 2 attributes.''', require=False, validate=validators.Boolean()) @Configuration() def map(self, records): # self.logger.debug('mispgetioc.map') return records def reduce(self, records): # Phase 1: Preparation my_args = prepare_config(self) my_args['misp_url'] = my_args['misp_url'] + '/attributes/restSearch' jsonmode = False if self.mode is not None: if 'j' in self.mode and self.json_request is not None: jsonmode = True if jsonmode is True: pagination = True other_page = True body_dict = json.loads(self.json_request) logging.info('Option "json_request" set') body_dict['returnFormat'] = 'json' body_dict['withAttachments'] = False if 'limit' in body_dict: limit = int(body_dict['limit']) if limit == 0: pagination = False else: limit = 10000 if 'page' in body_dict: page = body_dict['page'] else: page = 1 page_length = 0 else: # build search JSON object body_dict = {"returnFormat": "json", "withAttachments": False} # add provided parameters to JSON request body # specific formats if self.last is not None: body_dict['last'] = self.last logging.info('Option "last" set with %s', body_dict['last']) if self.date_from is not None: body_dict['from'] = self.date_from logging.info('Option "date_from" set with %s', body_dict['from']) if self.date_to is not None: body_dict['to'] = self.date_to logging.info('Option "date_to" set with %s', body_dict['to']) else: logging.info('Option "date_to" will be set to now().') if self.threat_level_id is not None: body_dict['threat_level_id'] = self.threat_level_id logging.info('Option "threat_level_id" set with %s', body_dict['threat_level_id']) if self.org is not None: body_dict['org'] = self.org logging.info('Option "org" set') if self.eventid: if "," in self.eventid: event_criteria = {} event_list = self.eventid.split(",") event_criteria['OR'] = event_list body_dict['eventid'] = event_criteria else: body_dict['eventid'] = self.eventid logging.info('Option "eventid" set') if self.value is not None: body_dict['value'] = self.value logging.info('Option "value" set') if self.eventinfo is not None: body_dict['eventinfo'] = self.eventinfo logging.info('Option "eventinfo" set') # CSV strings if self.category is not None: cat_criteria = {} cat_list = self.category.split(",") cat_criteria['OR'] = cat_list body_dict['category'] = 
cat_criteria if self.type is not None: type_criteria = {} type_list = self.type.split(",") type_criteria['OR'] = type_list body_dict['type'] = type_criteria if self.tags is not None or self.not_tags is not None: tags_criteria = {} if self.tags is not None: tags_list = self.tags.split(",") tags_criteria['OR'] = tags_list if self.not_tags is not None: tags_list = self.not_tags.split(",") tags_criteria['NOT'] = tags_list body_dict['tags'] = tags_criteria if self.uuid is not None: uuid_criteria = {} uuid_list = self.uuid.split(",") uuid_criteria['OR'] = uuid_list body_dict['uuid'] = uuid_criteria # Booleans if self.to_ids is not None: body_dict['to_ids'] = self.to_ids logging.info('Option "to_ids" set with %s', body_dict['to_ids']) if self.enforceWarninglist is not None: body_dict['enforceWarninglist'] = self.enforceWarninglist logging.info('Option "enforceWarninglist" set with %s', body_dict['enforceWarninglist']) if self.deleted is not None: body_dict['deleted'] = self.deleted logging.info('Option "deleted" set with %s', body_dict['deleted']) if self.includeEventUuid is not None: body_dict['includeEventUuid'] = self.includeEventUuid logging.info('Option "includeEventUuid" set with %s', body_dict['includeEventUuid']) if self.includeEventTags is not None: body_dict['includeEventTags'] = self.includeEventTags logging.info('Option "includeEventTags" set with %s', body_dict['includeEventTags']) # Search pagination pagination = True other_page = True if self.page: page = self.page else: page = 1 page_length = 0 if self.limit is not None: if int(self.limit) == 0: pagination = False else: limit = int(self.limit) else: limit = 10000 # set proper headers headers = {'Content-type': 'application/json'} headers['Authorization'] = my_args['misp_key'] headers['Accept'] = 'application/json' results = [] # add colums for each type in results while other_page: if pagination is True: body_dict['page'] = page body_dict['limit'] = limit body = json.dumps(body_dict) logging.debug('mispapireport request body: %s', body) # search r = requests.post(my_args['misp_url'], headers=headers, data=body, verify=my_args['misp_verifycert'], cert=my_args['client_cert_full_path'], proxies=my_args['proxies']) # check if status is anything other than 200; throw an exception if it is r.raise_for_status() # response is 200 by this point or we would have thrown an exception response = r.json() if 'response' in response: if 'Attribute' in response['response']: page_length = len(response['response']['Attribute']) for a in response['response']['Attribute']: v = {} v['misp_Object'] = "-" if self.includeEventTags is True: v['misp_tag'] = "-" for ak, av in a.items(): if ak == 'Event': json_event = a['Event'] for ek, ev in json_event.items(): key = 'misp_event_' + ek v[key] = str(ev) elif ak == 'Tag': tag_list = [] for tag in a['Tag']: try: tag_list.append(str(tag['name'])) except Exception: pass v['misp_tag'] = tag_list else: vkey = 'misp_' + ak v[vkey] = av results.append(v) if pagination is True: if page_length < limit: other_page = False else: page = page + 1 else: other_page = False # add colums for each type in results typelist = [] for r in results: if r['misp_type'] not in typelist: typelist.append(r['misp_type']) output_dict = {} increment = 1 for r in results: key = str(r['misp_event_id']) + '_' + str(increment) increment = increment + 1 v = r for t in typelist: misp_t = 'misp_' + t.replace('-', '_').replace('|', '_p_') if t == r['misp_type']: v[misp_t] = r['misp_value'] else: v[misp_t] = '' output_dict[key] = v for k, v in 
output_dict.items(): yield v
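The reduce method above combines two recurring patterns: CSV options become `{"OR": [...]}` / `{"NOT": [...]}` criteria in the restSearch body, and results are fetched page by page until a short page signals the end. A minimal sketch of just that pattern; the URL and authentication key are placeholders, not values from a real MISP instance:

.. code-block:: python

import json
import requests

def build_body(types_csv=None, tags_csv=None, not_tags_csv=None, last=None):
    # CSV options become {"OR": [...]} / {"NOT": [...]} criteria, as in reduce() above
    body = {'returnFormat': 'json', 'withAttachments': False}
    if types_csv:
        body['type'] = {'OR': types_csv.split(',')}
    tags = {}
    if tags_csv:
        tags['OR'] = tags_csv.split(',')
    if not_tags_csv:
        tags['NOT'] = not_tags_csv.split(',')
    if tags:
        body['tags'] = tags
    if last:
        body['last'] = last
    return body

def paged_search(url, auth_key, body, limit=1000):
    # Page through /attributes/restSearch until a short page is returned
    headers = {'Content-type': 'application/json',
               'Authorization': auth_key,
               'Accept': 'application/json'}
    page = 1
    while True:
        body.update({'page': page, 'limit': limit})
        r = requests.post(url, headers=headers, data=json.dumps(body))
        r.raise_for_status()
        attributes = r.json().get('response', {}).get('Attribute', [])
        for attribute in attributes:
            yield attribute
        if len(attributes) < limit:
            break
        page += 1

# usage (placeholder values):
# for a in paged_search('https://misp.example.org/attributes/restSearch', 'API_KEY',
#                       build_body(types_csv='ip-dst,domain', last='7d')):
#     print(a['type'], a['value'])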
class ToSFXCommand(EventingCommand): """ ## Syntax <command> | tosfx ## Description One or more datapoints are generated for each input event's field(s) of the form `gauge_*`, `counter_*` or `cumulative_counter_*`. The metric name in SignalFx will be the `*` part of the field name. Any additional fields on the event will be attached as dimensions to the generated datapoints. """ access_token = Option() debug = Option(validate=validators.Boolean(), default=False) dry_run = Option(validate=validators.Boolean(), default=False) signalfx_realm = Option() ingest_url = Option() dp_endpoint = Option(default="/v2/event") def ensure_default_config(self): configs = configparser.ConfigParser(allow_no_value=True) local_config = os.path.abspath( os.path.join(os.getcwd(), "..", "local", "sfx.conf")) configs.read(local_config) def read_conf_value(field): try: return configs.get("setupentity", field) except configparser.NoOptionError: return None if not self.signalfx_realm: self.signalfx_realm = read_conf_value("signalfx_realm") if not self.ingest_url: self.ingest_url = read_conf_value("ingest_url") self.logger.error("getting access token") if not self.access_token: self.access_token = get_access_token(self.service) def transform(self, records): self.ensure_default_config() out = [] payload = [] for event in records: add_event_to_payload(self, event=event, payload=payload) if self.debug: event["endpoint"] = self.ingest_url + self.dp_endpoint out.append(event) self.logger.error(out) self.logger.error(payload) if not self.dry_run: resp = send_payload( payload=payload, target_url=compose_ingest_url(self.signalfx_realm, self.ingest_url, self.dp_endpoint), token=self.access_token, ) for event in out: event["status"] = resp.status_code if resp.status_code != 200: event["response_error"] = resp.content for event in out: yield event
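`ensure_default_config()` above implements an "option wins, the conf file is the fallback" lookup. A small standalone sketch of that pattern with `configparser`; the file name and stanza are taken from the code above, and unlike the original this sketch also guards against a missing stanza (`NoSectionError`), which is an assumption about the intended behaviour:

.. code-block:: python

import configparser

def read_conf_value(path, stanza, field):
    # Return None when the file, stanza, or option is missing, so the caller
    # keeps whatever value was passed on the search line
    configs = configparser.ConfigParser(allow_no_value=True)
    configs.read(path)  # read() silently ignores a missing file
    try:
        return configs.get(stanza, field)
    except (configparser.NoSectionError, configparser.NoOptionError):
        return None

signalfx_realm = None  # value of the search-time option, if any was given
if not signalfx_realm:
    signalfx_realm = read_conf_value('../local/sfx.conf', 'setupentity', 'signalfx_realm')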
class Bs4(StreamingCommand): """ A wrapper for BeautifulSoup4 to extract html/xml tags and text from them to use in Splunk. ##Syntax .. code-block:: bs4 textfield=<field> [get_text=<bool>] [get_text_label=<string>] [parser=<string>] [find=<tag>] [find_attrs=<quoted_key:value_pairs>] [find_all=<tag>] [find_all_attrs=<quoted_key:value_pairs>] [find_child=<tag>] [find_child_attrs=<quoted_key:value_pairs>] [find_children=<tag>] [find_children_attrs=<quoted_key:value_pairs>] ##Description A wrapper script to bring some functionality from BeautifulSoup to Splunk. Default is to get the text and send it to a new field 'get_text', otherwise the selection is returned in a field named 'soup'. Default is to use the 'lxml' parser, though you can specify others, 'html5lib' is not currently included. The find methods can be used in conjuction, their order of operation is find > find_all > find_child > find children. Each option has a similar named option appended '_attrs' that will accept inner and outer quoted key:value pairs for more precise selections. ##Example .. code-block:: * | bs4 textfield=_raw find="div" get_text=t """ textfield = Option( require=True, doc=''' **Syntax:** **textfield=***<fieldname>* **Description:** Name of the field that will contain the text to search against''', validate=validators.Fieldname()) parser = Option( default='lxml', doc=''' **Syntax:** **parser=***<string>* **Description:** Corresponds to parsers listed here https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser (currently html5lib not packaged with so not an option)''', ) find = Option( default=False, doc=''' **Syntax:** **find=***<tag>* **Description:** Corresponds to the name attribute of BeautifulSoup's find method''', ) find_attrs = Option( default=None, doc=''' **Syntax:** **find_attrs=***<quoted_key:value_pairs>* **Description:** Corresponds to the attrs attribute of BeautifulSoup's find method. Expects inner and outer quoted "'key1':'value1','key2':'value2'" pairs comma-separated but contained in outer quotes.''', ) find_all = Option( default=False, doc=''' **Syntax:** **find_all=***<tag>* **Description:** Corresponds to the name attribute of BeautifulSoup's find_all method. Order of operation is find > find_all > find_child > find_children so can be used in conjunction''', ) find_all_attrs = Option( default=None, doc=''' **Syntax:** **find_all_attrs=***<quoted_key:value_pairs>* **Description:** Corresponds to the attrs attribute of BeautifulSoup's find_all method. Expects inner and outer quoted "'key1':'value1','key2':'value2'" pairs comma-separated but contained in outer quotes.''', ) find_child = Option( default=False, doc=''' **Syntax:** **find_child=***<tag>* **Description:** Corresponds to the name attribute of BeautifulSoup's find_child method. Order of operation is find > find_all > find_child > find_children so can be used in conjunction''', ) find_child_attrs = Option( default=None, doc=''' **Syntax:** **find_child_attrs=***<quoted_key:value_pairs>* **Description:** Corresponds to the attrs attribute of BeautifulSoup's find_child method. Expects inner and outer quoted "'key1':'value1','key2':'value2'" pairs comma-separated but contained in outer quotes.''', ) find_children = Option( default=False, doc=''' **Syntax:** **find_children=***<tag>* **Description:** Corresponds to the name attribute of BeautifulSoup's find_children method. 
Order of operation is find > find_all > find_child > find_children so can be used in conjunction''', ) find_children_attrs = Option( default=None, doc=''' **Syntax:** **find_children_attrs=***<quoted_key:value_pairs>* **Description:** Corresponds to the attrs attribute of BeautifulSoup's find_children method. Expects inner and outer quoted "'key1':'value1','key2':'value2'" pairs comma-separated but contained in outer quotes.''', ) get_text = Option( default=True, doc=''' **Syntax:** **get_text=***<bool>* **Description:** If true, returns text minus html/xml formatting for given selection and places in field `get_text` otherwise returns the selection in a field called `soup1`''', validate=validators.Boolean()) get_text_label = Option( default='get_text', doc=''' **Syntax:** **get_text_label=***<string>* **Description:** If get_text is true, sets the label for the return field''', ) #http://dev.splunk.com/view/logging/SP-CAAAFCN def setup_logging(self): logger = logging.getLogger('splunk.foo') SPLUNK_HOME = os.environ['SPLUNK_HOME'] LOGGING_DEFAULT_CONFIG_FILE = os.path.join(SPLUNK_HOME, 'etc', 'log.cfg') LOGGING_LOCAL_CONFIG_FILE = os.path.join(SPLUNK_HOME, 'etc', 'log-local.cfg') LOGGING_STANZA_NAME = 'python' LOGGING_FILE_NAME = "nlp-text-analytics.log" BASE_LOG_PATH = os.path.join('var', 'log', 'splunk') LOGGING_FORMAT = "%(asctime)s %(levelname)-s\t%(module)s:%(lineno)d - %(message)s" splunk_log_handler = logging.handlers.RotatingFileHandler( os.path.join( SPLUNK_HOME, BASE_LOG_PATH, LOGGING_FILE_NAME ), mode='a') splunk_log_handler.setFormatter(logging.Formatter(LOGGING_FORMAT)) logger.addHandler(splunk_log_handler) setupSplunkLogger( logger, LOGGING_DEFAULT_CONFIG_FILE, LOGGING_LOCAL_CONFIG_FILE, LOGGING_STANZA_NAME ) return logger def stream(self, records): for record in records: soup = BeautifulSoup(record[self.textfield], self.parser) if self.find: if self.find_attrs is not None: soup = soup.find( self.find, literal_eval('{'+self.find_attrs+'}') ) else: soup = soup.find(self.find) if self.find_all: if self.find_all_attrs is not None: soup = soup.find_all( self.find_all, literal_eval('{'+self.find_all_attrs+'}') ) else: soup = soup.find_all(self.find_all) if self.find_child: if self.find_child_attrs is not None: soup = soup.findChild( self.find_child, literal_eval('{'+self.find_child_attrs+'}') ) else: soup = soup.findChild(self.find_child) if self.find_children: if self.find_children_attrs is not None: soup = soup.findChildren( self.find_children, literal_eval('{'+self.find_children_attrs+'}') ) else: soup = soup.findChildren(self.find_children) if self.get_text and not (self.find_all or self.find_children): record[self.get_text_label] = \ soup.get_text().decode('unicode_escape').encode('ascii','ignore') elif self.get_text and (self.find_all or self.find_children): record[self.get_text_label] = [ i.get_text().decode('unicode_escape').encode('ascii','ignore') for i in soup ] else: record['soup'] = soup yield record
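The `*_attrs` options arrive as quoted `'key':'value'` pairs and are turned into a dict with `literal_eval` before being handed to BeautifulSoup. A standalone sketch of that handling; it uses the stdlib `html.parser` so it runs without lxml, whereas the command itself defaults to the lxml parser:

.. code-block:: python

from ast import literal_eval
from bs4 import BeautifulSoup

html = '<div class="post"><p id="body">hello</p><p id="footer">bye</p></div>'
find_attrs = "'id':'body'"  # the quoted key:value pairs as they arrive on the search line

soup = BeautifulSoup(html, 'html.parser')
node = soup.find('p', literal_eval('{' + find_attrs + '}'))
print(node.get_text())  # -> hello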
class INSEECommand(GeneratingCommand): """ Synopsis ##Syntax | insee [dtr=date_to_retrieve] [proxy=true] [debug=true] ##Description Request the Sirene API """ dtr = Option(require=False, validate=Date()) debug = Option(require=False, validate=validators.Boolean()) proxy = Option(require=False, validate=validators.Boolean()) # https://www.sirene.fr/sirene/public/variable/tefet LIBTEFET = { 'NN': 'Unités non employeuses', '00': '0 salarié', '01': '1 ou 2 salariés', '02': '3 à 5 salariés', '03': '6 à 9 salariés', '11': '10 à 19 salariés', '12': '20 à 49 salariés', '21': '50 à 99 salariés', '22': '100 à 199 salariés', '31': '200 à 249 salariés', '32': '250 à 499 salariés', '41': '500 à 999 salariés', '42': '1 000 à 1 999 salariés', '51': '2 000 à 4 999 salariés', '52': '5 000 à 9 999 salariés', '53': '10 000 salariés et plus' } # https://www.sirene.fr/sirene/public/variable/rpen RPEN = { '01': ['971'], '02': ['972'], '03': ['973'], '04': ['974'], '06': ['976'], '07': ['977'], '08': ['978'], '11': ['75', '77', '78', '91', '92', '93', '94', '95'], '24': ['18', '28', '36', '37', '41', '45'], '27': ['21', '25', '39', '58', '70', '71', '89', '90'], '28': ['14', '27', '50', '61', '76'], '32': ['02', '59', '60', '62', '80'], '44': ['08', '10', '51', '52', '54', '55', '57', '67', '68', '88'], '52': ['44', '49', '53', '72', '85'], '53': ['22', '29', '35', '56'], '75': [ '16', '17', '19', '23', '24', '33', '40', '47', '64', '79', '86', '87' ], '76': [ '09', '11', '12', '30', '31', '32', '34', '46', '48', '65', '66', '81', '82' ], '84': [ '01', '03', '07', '15', '26', '38', '42', '43', '63', '69', '73', '74' ], '93': ['04', '05', '06', '13', '83', '84'], '94': ['2A', '2B'], '98': ['975', '984', '986', '987', '988'], '99': ['99'], } # https://www.sirene.fr/sirene/public/variable/depet DEPET = {''} count_in = 0 count_out = 0 def set_configuration(self): # Open the configuration file try: with open( os.path.dirname(os.path.abspath(__file__)) + '/configuration_json.txt', 'r') as conf_file: conf = json.load(conf_file) except ValueError: self.logger.error(' invalid JSON configuration file') raise ExceptionConfiguration( 'Invalid JSON in the configuration file') except IOError: self.logger.error(' configuration file doesn\'t exist') raise ExceptionConfiguration('Missing configuration file') # Verify the configuration if self.proxy: if 'http_proxy' not in conf or 'https_proxy' not in conf: self.logger.error( ' proxies are not defined in the configuration file') raise ExceptionConfiguration( 'Proxies are not defined in the configuration file') self.proxies = dict() self.proxies['http'] = conf['http_proxy'] self.proxies['https'] = conf['https_proxy'] if 'consumer_key' not in conf or 'consumer_secret' not in conf: self.logger.error( ' API credentials are not defined in the configuration file') raise ExceptionConfiguration( 'Missing API credentials in the configuration file') if 'endpoint_token' not in conf or 'endpoint_etablissement' not in conf or 'endpoint_informations' not in conf: self.logger.error( ' API endpoints are not defined in the configuration file') raise ExceptionConfiguration( 'Missing API endpoints in the configuration file') self.consumer_key = conf['consumer_key'] self.consumer_secret = conf['consumer_secret'] self.endpoint_token = conf['endpoint_token'] self.endpoint_etablissement = conf['endpoint_etablissement'] self.endpoint_informations = conf['endpoint_informations'] self.bearer_token = self.get_api_token() def get_api_token(self): payload = {'grant_type': 'client_credentials'} basic_auth = 
HTTPBasicAuth(self.consumer_key, self.consumer_secret) if self.proxy: r = requests.post(self.endpoint_token, auth=basic_auth, data=payload, proxies=self.proxies) else: r = requests.post(self.endpoint_token, auth=basic_auth, data=payload) if self.debug: self.logger.debug(' token response %s\n%s', r.headers, r.text) if r.headers['Content-Type'] and 'application/json' in r.headers[ 'Content-Type']: if r.status_code == 200: return r.json()['access_token'] elif r.status_code == 401: self.logger.error(' incorrect credentials : %s', r.json()['error_description']) else: self.logger.error( ' error during token retrieval. Code received : %d', r.status_code) else: self.logger.error( ' error during token retrieval. Code received : %d', r.status_code) raise ExceptionToken('Error during API token retrieval') def get_status(self): # Initialize headers = {'Authorization': 'Bearer ' + self.bearer_token} if self.proxy: r = requests.get(self.endpoint_informations, headers=headers, proxies=self.proxies) else: r = requests.get(self.endpoint_informations, headers=headers) if self.debug: self.logger.debug(' status response %s\n%s', r.headers, r.text) while r.status_code == 429: # We made too many requests. We wait for the next rounded minute current_second = datetime.now().time().strftime('%S') time.sleep(60 - int(current_second) + 1) if self.proxy: r = requests.get(self.endpoint_informations, headers=headers, proxies=self.proxies) else: r = requests.get(self.endpoint_informations, headers=headers) if self.debug: self.logger.debug(' status response %s\n%s', r.headers, r.text) if r.headers['Content-Type'] and 'application/json' in r.headers[ 'Content-Type']: if r.status_code == 200: return r.json() elif r.status_code == 401: self.logger.error( ' invalid bearer token %s in status request', self.bearer_token) elif r.status_code == 406: self.logger.error(' invalid Accept header in status request') else: self.logger.error( ' error during status retrieval. Code received : %d', r.status_code) else: self.logger.error( ' error during status retrieval. Code received : %d', r.status_code) raise ExceptionStatus('Error during information retrieval') def get_siret(self, q=None, nombre=None, curseur=None, champs=None, gzip=False): # Initialize payload = dict() if champs: payload['champs'] = champs if q: payload['q'] = q if nombre: payload['nombre'] = nombre if curseur: payload['curseur'] = curseur headers = {'Authorization': 'Bearer ' + self.bearer_token} if gzip: # Request GZip content headers['Accept-Encoding'] = 'gzip' if self.proxy: r = requests.get(self.endpoint_etablissement, headers=headers, params=payload, proxies=self.proxies) else: r = requests.get(self.endpoint_etablissement, headers=headers, params=payload) if self.debug: self.logger.debug(' siret response %s\n%s', r.headers, r.text) while r.status_code == 429: # We made too many requests. 
We wait for the next rounded minute current_second = datetime.now().time().strftime('%S') time.sleep(60 - int(current_second) + 1) if self.proxy: r = requests.get(self.endpoint_etablissement, headers=headers, params=payload, proxies=self.proxies) else: r = requests.get(self.endpoint_etablissement, headers=headers, params=payload) if self.debug: self.logger.debug(' siret response %s\n%s', r.headers, r.text) internal_error_counter = 0 while r.status_code == 500: # In case we get a 500 we prefer to retry our request before raising an error internal_error_counter += 1 time.sleep(60) if self.proxy: r = requests.get(self.endpoint_etablissement, headers=headers, params=payload, proxies=self.proxies) else: r = requests.get(self.endpoint_etablissement, headers=headers, params=payload) if self.debug: self.logger.debug(' siret response %s\n%s', r.headers, r.text) if internal_error_counter == 10: break if r.headers['Content-Type'] and 'application/json' in r.headers[ 'Content-Type']: if r.status_code == 200: return r.json() elif r.status_code == 400: self.logger.error(' invalid parameters in query: %s', r.json()['header']['message']) elif r.status_code == 401: self.logger.error(' invalid bearer token %s in siret request', self.bearer_token) elif r.status_code == 404: self.logger.error(' unknown siret: %s', r.json()['header']['message']) elif r.status_code == 406: self.logger.error(' invalid Accept header in siret request') elif r.status_code == 414: self.logger.error(' siret request URI too long') else: self.logger.error( ' error during siret retrieval. Code received : %d', r.status_code) else: self.logger.error( ' error during siret retrieval. Code received : %d', r.status_code) raise ExceptionSiret('Error during siret retrieval') def get_updated_siret_records(self, date, curseur): # Which fields do we need champs = 'siren,nic,siret,complementAdresseEtablissement,numeroVoieEtablissement,indiceRepetitionEtablissement,' \ 'typeVoieEtablissement,libelleVoieEtablissement,codePostalEtablissement,libelleCedexEtablissement,' \ 'codeCommuneEtablissement,libelleCommuneEtablissement' # Build the filter q = 'dateDernierTraitementEtablissement:' + date j = self.get_siret(q=q, curseur=curseur, nombre=1000, gzip=True) try: header = j['header'] etablissements = j['etablissements'] curseur_suivant = header['curseurSuivant'] total = header['total'] # Get header for debugging purposes if self.debug: self.logger.debug(' header siret %s', header) except KeyError as e: self.logger.error(' missing key in response from API: %s', e) raise ExceptionUpdatedSiret('Error during headquarters retrieval') return total, curseur_suivant, etablissements @staticmethod def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in xrange(0, len(l), n): yield l[i:i + n] def get_etablissements_siege(self, siret_to_retrieve): # Which fields do we need champs = 'siren,nic,siret,etablissementSiege,codeCommuneEtablissement,codePaysEtrangerEtablissement' # Retrieve 85 records at each request # If we have more than 85 siret, the query is too long and blocked by INSEE step = 85 sieges = dict() for chunk in list(self.chunks(siret_to_retrieve, step)): q = '' for siret in chunk: q += 'siret:' + siret + ' OR ' q = q[:-4] try: j = self.get_siret(q=q, nombre=step, champs=champs, gzip=True) except ExceptionSiret: continue try: header = j['header'] for s in j['etablissements']: sieges[s['siret']] = s # Get header for debugging purposes if self.debug: self.logger.debug(' header siret %s', header) except KeyError as e: self.logger.error(' 
missing key in response from API: %s', e) raise ExceptionHeadquarters( 'Error during headquarters retrieval') self.logger.info(' retrieved %d of %d headquarters', len(sieges), len(siret_to_retrieve)) return sieges def generate_siret(self, siret, siret_siege): new_siret = OrderedDict() v = lambda t: '' if t is None else t.encode('utf-8') try: u = siret['uniteLegale'] a = siret['adresseEtablissement'] # This field is unused a2 = siret['adresse2Etablissement'] p = siret['periodesEtablissement'][0] new_siret['SIREN'] = v(siret['siren']) new_siret['NIC'] = v(siret['nic']) # Physical person sul = None if v(u['categorieJuridiqueUniteLegale']) == '1000': if v(u['sexeUniteLegale']): sul = v(u['sexeUniteLegale']) if sul == 'F': sul = 'MADAME' elif sul == 'M': sul = 'MONSIEUR' if v(u['nomUsageUniteLegale']): nul = v(u['nomUsageUniteLegale']) else: nul = v(u['nomUniteLegale']) puul = v(u['prenomUsuelUniteLegale']) new_siret['L1_NORMALISEE'] = ' '.join( filter(None, [sul, puul, nul])) else: new_siret['L1_NORMALISEE'] = v(u['denominationUniteLegale']) new_siret['L2_NORMALISEE'] = '' nve = v(a['numeroVoieEtablissement']) tve = v(a['typeVoieEtablissement']) lve = v(a['libelleVoieEtablissement']) new_siret['L3_NORMALISEE'] = ' '.join(filter( None, [nve, tve, lve])) new_siret['L4_NORMALISEE'] = '' new_siret['L5_NORMALISEE'] = '' cpe = v(a['codePostalEtablissement']) lce = v(a['libelleCommuneEtablissement']) new_siret['L6_NORMALISEE'] = ' '.join(filter(None, [cpe, lce])) if a['codePaysEtrangerEtablissement'] and a[ 'libellePaysEtrangerEtablissement']: new_siret['L7_NORMALISEE'] = a[ 'libellePaysEtrangerEtablissement'].encode('utf-8') else: new_siret['L7_NORMALISEE'] = 'FRANCE'.encode('utf-8') new_siret['L1_DECLAREE'] = new_siret['L1_NORMALISEE'] new_siret['L2_DECLAREE'] = '' new_siret['L3_DECLAREE'] = new_siret['L3_NORMALISEE'] new_siret['L4_DECLAREE'] = '' new_siret['L5_DECLAREE'] = '' new_siret['L6_DECLAREE'] = '' new_siret['L7_DECLAREE'] = new_siret['L7_NORMALISEE'] new_siret['NUMVOIE'] = v(a['numeroVoieEtablissement']) new_siret['INDREP'] = v(a['indiceRepetitionEtablissement']) new_siret['TYPVOIE'] = v(a['typeVoieEtablissement']) new_siret['LIBVOIE'] = v(a['libelleVoieEtablissement']) new_siret['CODPOS'] = v(a['codePostalEtablissement']) new_siret['CEDEX'] = v(a['codeCedexEtablissement']) new_siret['RPET'] = '' new_siret['LIBREG'] = '' new_siret['DEPET'] = v(a['codeCommuneEtablissement'])[:2] new_siret['ARRONET'] = '' new_siret['CTONET'] = '' new_siret['COMET'] = v(a['codeCommuneEtablissement']) new_siret['LIBCOM'] = v(a['libelleCommuneEtablissement']) new_siret['DU'] = '' new_siret['TU'] = '' new_siret['UU'] = '' new_siret['EPCI'] = '' new_siret['TCD'] = '' new_siret['ZEMET'] = '' if siret['etablissementSiege']: new_siret['SIEGE'] = 1 else: new_siret['SIEGE'] = 0 new_siret['ENSEIGNE'] = v(p['enseigne1Etablissement']) new_siret['IND_PUBLIPO'] = '' new_siret['DIFFCOM'] = 'O'.encode('utf-8') new_siret['AMINTRET'] = date.today().strftime('%Y%m') new_siret['NATETAB'] = '' new_siret['LIBNATETAB'] = '' new_siret['APET700'] = v( p['activitePrincipaleEtablissement']).replace('.', '') new_siret['LIBAPET'] = v(p['activitePrincipaleEtablissement']) new_siret['DAPET'] = '' new_siret['TEFET'] = v(siret['trancheEffectifsEtablissement']) if siret['trancheEffectifsEtablissement']: new_siret['LIBTEFET'] = self.LIBTEFET[ siret['trancheEffectifsEtablissement']] else: new_siret['LIBTEFET'] = '' new_siret['EFETCENT'] = '' new_siret['DEFET'] = v(siret['anneeEffectifsEtablissement']) new_siret['ORIGINE'] = '' 
new_siret['DCRET'] = v(siret['dateCreationEtablissement']).replace( '-', '') new_siret['DDEBACT'] = '' new_siret['ACTIVNAT'] = '' new_siret['LIEUACT'] = '' new_siret['ACTISURF'] = '' new_siret['SAISONAT'] = '' new_siret['MODET'] = '' new_siret['PRODET'] = '' new_siret['PRODPART'] = '' new_siret['AUXILT'] = '' # Physical person if v(u['categorieJuridiqueUniteLegale']) == '1000': nul = v(u['nomUniteLegale']) p1ul = v(u['prenom1UniteLegale']) p2ul = v(u['prenom2UniteLegale']) p3ul = v(u['prenom3UniteLegale']) p4ul = v(u['prenom4UniteLegale']) pul = ' '.join(filter(None, [p1ul, p2ul, p3ul, p4ul])) if v(u['nomUsageUniteLegale']): new_siret['NOMEN_LONG'] = nul + '*' + v( u['nomUsageUniteLegale']) + '/' + pul + '/' else: new_siret['NOMEN_LONG'] = nul + '*' + pul + '/' else: new_siret['NOMEN_LONG'] = v(u['denominationUniteLegale']) new_siret['SIGLE'] = v(u['sigleUniteLegale']) new_siret['NOM'] = v(u['nomUniteLegale']) new_siret['PRENOM'] = v(u['prenom1UniteLegale']) new_siret['CIVILITE'] = '' if v(u['sexeUniteLegale']) == 'F': new_siret['CIVILITE'] = 2 elif v(u['sexeUniteLegale']) == 'M': new_siret['CIVILITE'] = 1 new_siret['RNA'] = v(u['identifiantAssociationUniteLegale']) new_siret['NICSIEGE'] = v(u['nicSiegeUniteLegale']) if siret['etablissementSiege']: if v(a['codePaysEtrangerEtablissement']): cce = v(a['codePaysEtrangerEtablissement']) else: cce = v(a['codeCommuneEtablissement']) department = cce[:3] rpen = '' for key, value in self.RPEN.items(): if department in value: rpen = key if rpen == '': department = cce[:2] for key, value in self.RPEN.items(): if department in value: rpen = key else: rpen = '' cce = '' try: siege = siret_siege[v(siret['siren']) + v(u['nicSiegeUniteLegale'])] except KeyError as e: self.logger.info( ' siret %s has an invalid headquarter %s', v(siret['siret']), v(siret['siren']) + v(u['nicSiegeUniteLegale'])) else: if v(siege['adresseEtablissement'] ['codePaysEtrangerEtablissement']): cce = v(siege['adresseEtablissement'] ['codePaysEtrangerEtablissement']) else: cce = v(siege['adresseEtablissement'] ['codeCommuneEtablissement']) department = cce[:3] rpen = '' for key, value in self.RPEN.items(): if department in value: rpen = key if rpen == '': department = cce[:2] for key, value in self.RPEN.items(): if department in value: rpen = key new_siret['RPEN'] = rpen new_siret['DEPCOMEN'] = cce new_siret['ADR_MAIL'] = '' new_siret['NJ'] = v(u['categorieJuridiqueUniteLegale']) new_siret['LIBNJ'] = v(u['categorieJuridiqueUniteLegale']) new_siret['APEN700'] = v( u['activitePrincipaleUniteLegale']).replace('.', '') new_siret['LIBAPEN'] = v(u['activitePrincipaleUniteLegale']) new_siret['DAPEN'] = '' new_siret['APRM'] = v( siret['activitePrincipaleRegistreMetiersEtablissement']) new_siret['ESS'] = v(u['economieSocialeSolidaireUniteLegale']) new_siret['DATEESS'] = '' new_siret['TEFEN'] = v(u['trancheEffectifsUniteLegale']) if u['trancheEffectifsUniteLegale']: new_siret['LIBTEFEN'] = self.LIBTEFET[ u['trancheEffectifsUniteLegale']] else: new_siret['LIBTEFEN'] = '' new_siret['EFENCENT'] = '' new_siret['DEFEN'] = v(u['anneeEffectifsUniteLegale']) new_siret['CATEGORIE'] = v(u['categorieEntreprise']) new_siret['DCREN'] = v(u['dateCreationUniteLegale']) new_siret['AMINTREN'] = date.today().strftime('%Y%m') new_siret['MONOACT'] = '' new_siret['MODEN'] = '' new_siret['PRODEN'] = '' new_siret['ESAANN'] = '' new_siret['TCA'] = '' new_siret['ESAAPEN'] = '' new_siret['ESASEC1N'] = '' new_siret['ESASEC2N'] = '' new_siret['ESASEC3N'] = '' new_siret['ESASEC4N'] = '' if 
v(p['etatAdministratifEtablissement']) == 'A': new_siret['VMAJ'] = 'C' self.count_in += 1 elif v(p['etatAdministratifEtablissement']) == 'F': new_siret['VMAJ'] = 'O' self.count_out += 1 new_siret['VMAJ1'] = '' new_siret['VMAJ2'] = '' new_siret['VMAJ3'] = '' new_siret['DATEMAJ'] = v( siret['dateDernierTraitementEtablissement']) if v(p['etatAdministratifEtablissement']) == 'A': new_siret['EVE'] = 'CE' elif v(p['etatAdministratifEtablissement']) == 'F': new_siret['EVE'] = 'O' new_siret['DATEVE'] = v( siret['dateDernierTraitementEtablissement'])[:10].replace( '-', '') new_siret['TYPCREH'] = '' new_siret['DREACTET'] = '' new_siret['DREACTEN'] = '' new_siret['MADRESSE'] = '' new_siret['MENSEIGNE'] = '' new_siret['MAPET'] = '' new_siret['MPRODET'] = '' new_siret['MAUXILT'] = '' new_siret['MNOMEN'] = '' new_siret['MSIGLE'] = '' new_siret['MNICSIEGE'] = '' new_siret['MNJ'] = '' new_siret['MAPEN'] = '' new_siret['MPRODEN'] = '' new_siret['SIRETPS'] = '' new_siret['TEL'] = '' except KeyError as e: self.logger.error(' missing key in siret received from API: %s', e) if self.debug: self.logger.debug(' siret to update: %s', siret) self.logger.debug(' new_siret object: %s', new_siret) raise ExceptionTranslation('Error during siret translation') raw = ''.join(k + '=' + '\"{0}\"'.format(v) + ' ' for k, v in new_siret.items()) return raw def generate(self): try: self.set_configuration() # CSV header csv_header = [ 'SIREN', 'NIC', 'L1_NORMALISEE', 'L2_NORMALISEE', 'L3_NORMALISEE', 'L4_NORMALISEE', 'L5_NORMALISEE', 'L6_NORMALISEE', 'L7_NORMALISEE', 'L1_DECLAREE', 'L2_DECLAREE', 'L3_DECLAREE', 'L4_DECLAREE', 'L5_DECLAREE', 'L6_DECLAREE', 'L7_DECLAREE', 'NUMVOIE', 'INDREP', 'TYPVOIE', 'LIBVOIE', 'CODPOS', 'CEDEX', 'RPET', 'LIBREG', 'DEPET', 'ARRONET', 'CTONET', 'COMET', 'LIBCOM', 'DU', 'TU', 'UU', 'EPCI', 'TCD', 'ZEMET', 'SIEGE', 'ENSEIGNE', 'IND_PUBLIPO', 'DIFFCOM', 'AMINTRET', 'NATETAB', 'LIBNATETAB', 'APET700', 'LIBAPET', 'DAPET', 'TEFET', 'LIBTEFET', 'EFETCENT', 'DEFET', 'ORIGINE', 'DCRET', 'DDEBACT', 'ACTIVNAT', 'LIEUACT', 'ACTISURF', 'SAISONAT', 'MODET', 'PRODET', 'PRODPART', 'AUXILT', 'NOMEN_LONG', 'SIGLE', 'NOM', 'PRENOM', 'CIVILITE', 'RNA', 'NICSIEGE', 'RPEN', 'DEPCOMEN', 'ADR_MAIL', 'NJ', 'LIBNJ', 'APEN700', 'LIBAPEN', 'DAPEN', 'APRM', 'ESS', 'DATEESS', 'TEFEN', 'LIBTEFEN', 'EFENCENT', 'DEFEN', 'CATEGORIE', 'DCREN', 'AMINTREN', 'MONOACT', 'MODEN', 'PRODEN', 'ESAANN', 'TCA', 'ESAAPEN', 'ESASEC1N', 'ESASEC2N', 'ESASEC3N', 'ESASEC4N', 'VMAJ', 'VMAJ1', 'VMAJ2', 'VMAJ3', 'DATEMAJ', 'EVE', 'DATEVE', 'TYPCREH', 'DREACTET', 'DREACTEN', 'MADRESSE', 'MENSEIGNE', 'MAPET', 'MPRODET', 'MAUXILT', 'MNOMEN', 'MSIGLE', 'MNICSIEGE', 'MNJ', 'MAPEN', 'MPRODEN', 'SIRETPS', 'TEL' ] # Get status status_object = self.get_status() if status_object: if 'versionService' in status_object: self.logger.info( ' versionService %s', status_object['versionService'].encode('utf-8')) if 'datesDernieresMisesAJourDesDonnees' in status_object: for collection in status_object[ 'datesDernieresMisesAJourDesDonnees']: msg = '' if 'collection' in collection and collection[ 'collection']: msg += 'collection %s' % collection[ 'collection'].encode('utf-8') msg += ' ' if 'dateDerniereMiseADisposition' in collection and collection[ 'dateDerniereMiseADisposition']: msg += 'dateDerniereMiseADisposition %s' %\ collection['dateDerniereMiseADisposition'].encode('utf-8') msg += ' ' if 'dateDernierTraitementDeMasse' in collection and collection[ 'dateDernierTraitementDeMasse']: msg += 'dateDernierTraitementDeMasse %s' %\ 
collection['dateDernierTraitementDeMasse'].encode('utf-8') msg += ' ' if 'dateDernierTraitementMaximum' in collection and collection[ 'dateDernierTraitementMaximum']: msg += 'dateDernierTraitementMaximum %s' % \ collection['dateDernierTraitementMaximum'].encode('utf-8') msg += ' ' self.logger.info(' %s', msg.encode('utf-8')) # Date to retrieve has been set if self.dtr: day_to_retrieve = self.dtr # Day before yesterday else: day_to_retrieve = (date.today() - timedelta(1)).strftime('%Y-%m-%d') # Log the requested date to help debugging self.logger.info(' dtr: %s', day_to_retrieve.encode('utf-8')) # Log the username to help debugging self.logger.info( ' Splunk username: %s', self._metadata.searchinfo.username.encode('utf-8')) event = 1 curseur = '*' first_call = True received_siret = 0 while True: _, curseur_suivant, updated_siret_list = self.get_updated_siret_records( day_to_retrieve, curseur) if first_call: self.logger.info( ' retrieved a total of %d siret to update', _) first_call = False self.logger.info( ' retrieved %d siret to update in this window', len(updated_siret_list)) received_siret += len(updated_siret_list) self.logger.info(' retrieved %d siret / %d', received_siret, _) siret_to_retrieve = list() for siret in updated_siret_list: if not siret['etablissementSiege']: if siret['siren'] + siret['uniteLegale'][ 'nicSiegeUniteLegale'] not in siret_to_retrieve: siret_to_retrieve.append( siret['siren'] + siret['uniteLegale']['nicSiegeUniteLegale']) # We retrieve all headquarters siret_siege = self.get_etablissements_siege(siret_to_retrieve) for siret in updated_siret_list: raw_data = self.generate_siret(siret, siret_siege) yield { '_time': time.time(), 'event_no': event, '_raw': raw_data } event += 1 # We get the same curseur so we get all updated siret if curseur_suivant == curseur: break curseur = curseur_suivant self.logger.info(' generated %d events', event - 1) self.logger.info(' found %d SIRET to create', self.count_in) self.logger.info(' found %d SIRET to delete', self.count_out) except (ExceptionTranslation, ExceptionHeadquarters, ExceptionUpdatedSiret, ExceptionSiret, ExceptionStatus, ExceptionToken, ExceptionConfiguration): raise # This is a bad practise, but we want a specific message in log file # This case means that the code is missing an Exception handling except Exception as e: self.logger.error( ' unhandled exception has occurred. Traceback is in splunklib.log: %s', e.message) raise
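`get_etablissements_siege()` above batches SIRET numbers into groups of 85 and joins each group into one `siret:X OR siret:Y ...` filter per request, because a longer query URI is rejected by the API. The `chunks` helper uses `xrange`, which only exists in Python 2; below is a Python 3 sketch of the same pattern with made-up SIRET values:

.. code-block:: python

def chunks(items, n):
    """Yield successive n-sized chunks from items (range replaces Python 2's xrange)."""
    for i in range(0, len(items), n):
        yield items[i:i + n]

# made-up SIRET values, 85 per request to keep the query URI short enough
siret_to_retrieve = ['00000000000001', '00000000000002', '00000000000003']
for chunk in chunks(siret_to_retrieve, 85):
    q = ' OR '.join('siret:' + siret for siret in chunk)
    print(q)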
class genatklayerCommand(StreamingCommand): """ Counts the number of non-overlapping matches to a regular expression in a set of fields. ##Syntax .. code-block:: genatklayer name=<string> description=<string> reset=<bool> ##Description Takes input from a search and attempts to map it to Att&ck framework techniques and create a layer file. The parameter `name` is the name for the layer file as it will be stored in KVStore. The parameter `description` is the description for the layer file as it will be stored in KVstore. ##Example Read in some data (tweets) and atttempt to save layer file to KVStore with name "my name" and description "my description" .. code-block:: | inputlookup tweets | genatklayer name="my name" description="my description" """ atkfield = Option( doc=''' **Syntax:** **atkfield=***<field that stores att&ck technique id>* **Description:** The name of the field in your search results that has the att&ck technique id''', require=False, validate=None) layername = Option( doc=''' **Syntax:** **name=***<layer name>* **Description:** What name you want to give the layer in KVStore''', require=False, validate=None) description = Option( doc=''' **Syntax:** **description=***<layer description>* **Description:** What description you want to provide for the layer in KVStore''', require=False, validate=None) reset = Option( doc=''' **Syntax:** **reset=***<bool>* **Description:** Reset the default layer back to its original state''', require=False, validate=validators.Boolean()) # Base ATT&CK Navigator layer / template layer_json = { \ "version": VERSION, \ "name": NAME, \ "description": DESCRIPTION, \ "domain": DOMAIN, \ "techniques": [] \ } # per attack map - makes some color codes # add a color gradient (white -> red) to layer # ranging from zero (white) to the maximum score in the file (red) # To be implemented later """ layer_json["gradient"] = { "colors": [ "#ffffff", "#ff6666" ], "minValue": 0, "maxValue": max([technique["score"] for technique in layer_json["techniques"]]) } """ def getDefaultLayer(self, uri): r, c = splunk.rest.simpleRequest(uri, sessionKey=self.metadata.searchinfo.session_key, rawResult=True) # case where default layer is found via its _key value in kvstore if r.status == 200: return json.loads(c) # case where our default layer wasnt yet loaded into KVStore if r.status == 404: r, c = splunk.rest.simpleRequest(COLLECTION_URI, jsonargs=json.dumps(layer_default), sessionKey=self.metadata.searchinfo.session_key, rawResult=True) return {"error":"Layer file not loaded in KVStore, it has now been loaded on your behalf"} else: return {"error":json.loads(c)} def resetDefaultLayer(self,uri): # if layer exists - error similar to the below will be thrown # {"messages":[{"type":"ERROR","text":"A document with the same key and user already exists."}]} r, c = splunk.rest.simpleRequest(uri, jsonargs=json.dumps(layer_default), sessionKey=self.metadata.searchinfo.session_key, rawResult=True) if r.status == 200: return json.loads(c) else: return json.loads(c) def saveCustomLayer(self, layer_data): CUSTOM_LAYER_URI = "/servicesNS/Nobody/{}/storage/collections/data/attack_layers/{}?output_mode=json".format(appname,self.layername) # test to see if the custom layer already exists r, c = splunk.rest.simpleRequest(CUSTOM_LAYER_URI, sessionKey=self.metadata.searchinfo.session_key, rawResult=True) # yes, layer does exist if r.status == 200: # lets overwrite it r, c = splunk.rest.simpleRequest(CUSTOM_LAYER_URI, jsonargs=json.dumps(layer_data), 
sessionKey=self.metadata.searchinfo.session_key, rawResult=True) return {"error":"Layer file not loaded in KVStore, it has now been loaded on your behalf"} # no, layer does not exist, lets creat it and save it, note we need to drop down to the collection URI to POST our args if r.status == 404: r, c = splunk.rest.simpleRequest(COLLECTION_URI, jsonargs=json.dumps(layer_data), sessionKey=self.metadata.searchinfo.session_key, rawResult=True) return {"error":"Layer file not loaded in KVStore, it has now been loaded on your behalf"} #some other generic error else: return {"error":json.loads(c)} def stream(self, records): self.logger.debug('genatklayerCommand: %s', self) # logs command line if self.reset: resp = self.resetDefaultLayer(DEFAULT_URI) for record in records: record['_raw'] = json.dumps(resp) yield record # attempt to get the default layer default_layer = self.getDefaultLayer(DEFAULT_URI) if "error" in default_layer: raise Exception("Error retrieving layer. {}".format(str(default_layer['error']))) # iterate through our search results for record in records: # determine if the user specified a field to key off of for Technique ID # and if so, proceed if self.atkfield in record: # iterate through the techniques array in our layer file # we also will set our layers "scores" values per technique ID # to zero if we dont have a value yet, otherwise, proceed for tech in default_layer['techniques']: # Scoring example - reserving for later #if 'score' not in tech: # tech['score'] = 0 # determine if we have a match in this case between # a technique ID in our layer file and in our splunk record # as well as check if our atkfiled is mv # case where it is an mvfield if(isinstance(record[self.atkfield],list)): for item in record[self.atkfield]: if tech['techniqueID'] == six.text_type(item): # if there is a match, see if there's also a detected field in our splunk results # and if so, update the layer info to reflect that if 'detected' in record: if six.text_type(record['detected']) == "-1": tech['color'] = RED_LT #tech['score'] = tech['score'] + 1 elif six.text_type(record['detected']) == "0": tech['color'] = YELLOW_DK elif six.text_type(record['detected']) == "1": tech['color'] = BLUE_1 elif six.text_type(record['detected']) == "2": tech['color'] = BLUE_2 elif six.text_type(record['detected']) == "3": tech['color'] = BLUE_3 elif six.text_type(record['detected']) == "4": tech['color'] = BLUE_4 #case where it is not an mv field elif tech['techniqueID'] == six.text_type(record[self.atkfield]): # if there is a match, see if there's also a detected field in our splunk results # and if so, update the layer info to reflect that if 'detected' in record: if six.text_type(record['detected']) == "-1": tech['color'] = RED_LT #tech['score'] = tech['score'] + 1 elif six.text_type(record['detected']) == "0": tech['color'] = YELLOW_DK elif six.text_type(record['detected']) == "1": tech['color'] = BLUE_1 elif six.text_type(record['detected']) == "2": tech['color'] = BLUE_2 elif six.text_type(record['detected']) == "3": tech['color'] = BLUE_3 elif six.text_type(record['detected']) == "4": tech['color'] = BLUE_4 else: record['_raw'] = "Error no field with that name exists {}".format(self.atkfield) raise Exception("Error no field with that name exists {}".format(self.atkfield)) yield record # if the user passes a name arg then create the new kvstore entry for that new layer # will want to update this code in the future to handle error cases better, and user feedback if self.layername is not None: 
default_layer['_key'] = self.layername status = self.saveCustomLayer(default_layer) self.logger.debug('custom layer file requested: {}'.format(status)) # post updated default layer if all was successful & we didn't get a name argument else: r, c = splunk.rest.simpleRequest(DEFAULT_URI, jsonargs=json.dumps(default_layer), sessionKey=self.metadata.searchinfo.session_key, rawResult=True) if r.status == 200: self.logger.debug('updated default layer successfully: {}'.format(json.loads(c))) else: self.logger.error('error updating default layer: {}'.format(json.loads(c)))
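The `detected` value is mapped to a layer colour twice in `stream()` above, once for multivalue fields and once for single values. A sketch of the same mapping expressed once as a lookup table; the real RED_LT, YELLOW_DK and BLUE_* constants are defined elsewhere in the app, so the hex values here are placeholders:

.. code-block:: python

RED_LT, YELLOW_DK = '#ff6666', '#ffcc00'          # placeholder hex values
BLUE_1, BLUE_2, BLUE_3, BLUE_4 = '#cce6ff', '#99ccff', '#66b3ff', '#3399ff'

DETECTED_COLOR = {
    '-1': RED_LT,
    '0': YELLOW_DK,
    '1': BLUE_1,
    '2': BLUE_2,
    '3': BLUE_3,
    '4': BLUE_4,
}

def apply_detected_color(record, tech):
    """Colour one technique entry from the record's 'detected' field, if present."""
    detected = str(record.get('detected', ''))
    if detected in DETECTED_COLOR:
        tech['color'] = DETECTED_COLOR[detected]

tech = {'techniqueID': 'T1059'}
apply_detected_color({'detected': '-1'}, tech)
print(tech)  # -> {'techniqueID': 'T1059', 'color': '#ff6666'}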
class MispCollectCommand(GeneratingCommand): """ get the attributes from a MISP instance. ##Syntax .. code-block:: | mispgetioc misp_instance=<input> last=<int>(d|h|m) | mispgetioc misp_instance=<input> event=<id1>(,<id2>,...) | mispgetioc misp_instance=<input> date=<<YYYY-MM-DD> (date_to=<YYYY-MM-DD>) ##Description { "returnFormat": "mandatory", "page": "optional", "limit": "optional", "value": "optional", "type": "optional", "category": "optional", "org": "optional", "tags": "optional", "date": "optional", "last": "optional", "eventid": "optional", "withAttachments": "optional", "uuid": "optional", "publish_timestamp": "optional", "timestamp": "optional", "enforceWarninglist": "optional", "to_ids": "optional", "deleted": "optional", "includeEventUuid": "optional", "includeEventTags": "optional", "event_timestamp": "optional", "threat_level_id": "optional", "eventinfo": "optional", "includeProposals": "optional", "includeDecayScore": "optional", "includeFullModel": "optional", "decayingModel": "optional", "excludeDecayed": "optional", "score": "optional" } # status "returnFormat": forced to json, "page": param, "limit": param, "value": not managed, "type": param, CSV string, "category": param, CSV string, "org": not managed, "tags": param, see also not_tags "date": param, "last": param, "eventid": param, "withAttachments": forced to false, "uuid": not managed, "publish_timestamp": managed via param last "timestamp": not managed, "enforceWarninglist": param, "to_ids": param, "deleted": forced to False, "includeEventUuid": set to True, "includeEventTags": param, "event_timestamp": not managed, "threat_level_id": not managed, "eventinfo": not managed, "includeProposals": not managed "includeDecayScore": not managed, "includeFullModel": not managed, "decayingModel": not managed, "excludeDecayed": not managed, "score": not managed } """ # MANDATORY MISP instance for this search misp_instance = Option( doc=''' **Syntax:** **misp_instance=instance_name* **Description:** MISP instance parameters as described in local/misp42splunk_instances.conf.''', require=True) # MANDATORY: json_request XOR eventid XOR last XOR date json_request = Option( doc=''' **Syntax:** **json_request=***valid JSON request* **Description:**Valid JSON request''', require=False) eventid = Option( doc=''' **Syntax:** **eventid=***id1(,id2,...)* **Description:**list of event ID(s) or event UUID(s).''', require=False, validate=validators.Match("eventid", r"^[0-9a-f,\-]+$")) last = Option( doc=''' **Syntax:** **last=***<int>d|h|m* **Description:** publication duration in day(s), hour(s) or minute(s). **nota bene:** last is an alias of published_timestamp''', require=False, validate=validators.Match("last", r"^[0-9]+[hdm]$")) date = Option( doc=''' **Syntax:** **date=***The user set event date field - any of valid time related filters"* **Description:**starting date. **eventid**, **last** and **date** are mutually exclusive''', require=False) # Other params category = Option( doc=''' **Syntax:** **category=***CSV string* **Description:**Comma(,)-separated string of categories to search for. Wildcard is %.''', require=False) endpoint = Option( doc=''' **Syntax:** **endpoint=***<events|attributes>* **Description:**selection of MISP API restSearch endpoint. **default**: /attributes/restSearch''', require=False, validate=validators.Match("endpoint", r"(events|attributes)")) geteventtag = Option( doc=''' **Syntax:** **geteventtag=***<1|y|Y|t|true|True|0|n|N|f|false|False>* **Description:**Boolean includeEventTags. 
        By default only attribute tag(s) are returned.''',
        require=False, validate=validators.Boolean())
    keep_related = Option(
        doc='''
        **Syntax:** **keep_related=***<1|y|Y|t|true|True|0|n|N|f|false|False>*
        **Description:** Boolean to keep related events. Default is to drop RelatedEvents to reduce volume.''',
        require=False, validate=validators.Boolean())
    limit = Option(
        doc='''
        **Syntax:** **limit=***<int>*
        **Description:** Define the limit for each MISP search; default is 1000. 0 = no pagination.''',
        require=False, validate=validators.Match("limit", r"^[0-9]+$"))
    not_tags = Option(
        doc='''
        **Syntax:** **not_tags=***CSV string*
        **Description:** Comma(,)-separated string of tags to exclude. Wildcard is %.''',
        require=False)
    page = Option(
        doc='''
        **Syntax:** **page=***<int>*
        **Description:** Define the page for each MISP search; default is 1.''',
        require=False, validate=validators.Match("page", r"^[0-9]+$"))
    tags = Option(
        doc='''
        **Syntax:** **tags=***CSV string*
        **Description:** Comma(,)-separated string of tags to search for. Wildcard is %.''',
        require=False)
    to_ids = Option(
        doc='''
        **Syntax:** **to_ids=***<1|y|Y|t|true|True|0|n|N|f|false|False>*
        **Description:** Boolean to search only attributes with the flag "to_ids" set to true.''',
        require=False, validate=validators.Boolean())
    type = Option(
        doc='''
        **Syntax:** **type=***CSV string*
        **Description:** Comma(,)-separated string of types to search for. Wildcard is %.''',
        require=False)
    warning_list = Option(
        doc='''
        **Syntax:** **warning_list=***<1|y|Y|t|true|True|0|n|N|f|false|False>*
        **Description:** Boolean to filter out well-known values.''',
        require=False, validate=validators.Boolean())

    @staticmethod
    def _record(serial_number, time_stamp, host, attributes, attribute_names, encoder):
        raw = encoder.encode(attributes)
        # Formulate record
        fields = dict()
        for f in attribute_names:
            if f in attributes:
                fields[f] = attributes[f]

        if serial_number > 0:
            fields['_serial'] = serial_number
            fields['_time'] = time_stamp
            fields['_raw'] = raw
            fields['host'] = host
            return fields

        record = OrderedDict(chain(
            (('_serial', serial_number), ('_time', time_stamp),
             ('_raw', raw), ('host', host)),
            map(lambda name: (name, fields.get(name, '')), attribute_names)))
        return record

    def generate(self):
        # Phase 1: preparation
        misp_instance = self.misp_instance
        storage = self.service.storage_passwords
        my_args = prepare_config(self, 'misp42splunk', misp_instance, storage)
        if my_args is None:
            raise Exception("Sorry, no configuration for misp_instance={}".format(misp_instance))
        my_args['host'] = my_args['misp_url'].replace('https://', '')

        # Check that exactly ONE of the mandatory options is present
        mandatory_arg = 0
        if self.json_request is not None:
            mandatory_arg = mandatory_arg + 1
        if self.eventid:
            mandatory_arg = mandatory_arg + 1
        if self.last:
            mandatory_arg = mandatory_arg + 1
        if self.date:
            mandatory_arg = mandatory_arg + 1

        if mandatory_arg == 0:
            raise Exception('Missing "json_request", "eventid", "last" or "date" argument')
        elif mandatory_arg > 1:
            raise Exception('Options "json_request", "eventid", "last" and "date" are mutually exclusive')

        body_dict = dict()
        # Only ONE of the options was provided
        if self.json_request is not None:
            body_dict = json.loads(self.json_request)
            logging.info('Option "json_request" set')
        elif self.eventid:
            if "," in self.eventid:
                event_criteria = {}
                event_list = self.eventid.split(",")
                event_criteria['OR'] = event_list
                body_dict['eventid'] = event_criteria
            else:
                body_dict['eventid'] = self.eventid
            logging.info('Option "eventid" set with %s', json.dumps(body_dict['eventid']))
        elif self.last:
            body_dict['last'] = self.last
            logging.info('Option "last" set with %s', str(body_dict['last']))
        else:
            body_dict['date'] = self.date.split()
            logging.info('Option "date" set with %s', json.dumps(body_dict['date']))

        # Force some values on the JSON request
        body_dict['returnFormat'] = 'json'
        body_dict['withAttachments'] = False
        body_dict['deleted'] = False
        body_dict['includeEventUuid'] = True

        # Set proper headers
        headers = {'Content-type': 'application/json'}
        headers['Authorization'] = my_args['misp_key']
        headers['Accept'] = 'application/json'

        # Search pagination
        pagination = True
        if self.limit is not None:
            limit = int(self.limit)
        elif 'limit' in body_dict:
            limit = int(body_dict['limit'])
        else:
            limit = 1000
        if limit == 0:
            pagination = False
        if self.page is not None:
            page = int(self.page)
        elif 'page' in body_dict:
            page = body_dict['page']
        else:
            page = 1

        # Search parameters: booleans and filters.
        # To avoid false positives, enforceWarninglist is forced to True
        # whenever to_ids is set to True (search criterion).
        if self.category is not None:
            if "," in self.category:
                cat_criteria = {}
                cat_list = self.category.split(",")
                cat_criteria['OR'] = cat_list
                body_dict['category'] = cat_criteria
            else:
                body_dict['category'] = self.category
        if self.endpoint == 'events':
            my_args['misp_url'] = my_args['misp_url'] + '/events/restSearch'
        else:
            my_args['misp_url'] = my_args['misp_url'] + '/attributes/restSearch'
        if self.geteventtag is True:
            body_dict['includeEventTags'] = True
        if self.keep_related is True:
            keep_related = True
        else:
            keep_related = False
        if self.to_ids is True:
            body_dict['to_ids'] = True
            body_dict['enforceWarninglist'] = True  # protection
        elif self.to_ids is False:
            body_dict['to_ids'] = False
        if self.type is not None:
            if "," in self.type:
                type_criteria = {}
                type_list = self.type.split(",")
                type_criteria['OR'] = type_list
                body_dict['type'] = type_criteria
            else:
                body_dict['type'] = self.type
        if self.warning_list is True:
            body_dict['enforceWarninglist'] = True
        elif self.warning_list is False:
            body_dict['enforceWarninglist'] = False
        if self.tags is not None or self.not_tags is not None:
            tags_criteria = {}
            if self.tags is not None:
                tags_list = self.tags.split(",")
                tags_criteria['OR'] = tags_list
            if self.not_tags is not None:
                tags_list = self.not_tags.split(",")
                tags_criteria['NOT'] = tags_list
            body_dict['tags'] = tags_criteria
        if pagination is True:
            body_dict['page'] = page
            body_dict['limit'] = limit

        body = json.dumps(body_dict)
        logging.debug('mispcollect request body: %s', body)

        # Run the search; raise an exception unless the request succeeded (200/201/204)
        r = requests.post(my_args['misp_url'], headers=headers, data=body,
                          verify=my_args['misp_verifycert'],
                          cert=my_args['client_cert_full_path'],
                          proxies=my_args['proxies'])
        if r.status_code in (200, 201, 204):
            logging.info(
                "[CO301] INFO mispcollect successful. "
                "url={}, HTTP status={}".format(my_args['misp_url'], r.status_code))
        else:
            logging.error(
                "[CO302] ERROR mispcollect failed. "
                "url={}, data={}, HTTP Error={}, content={}"
                .format(my_args['misp_url'], body, r.status_code, r.text))
            raise Exception(
                "[CO302] ERROR mispcollect failed. "
                "url={}, data={}, HTTP Error={}, content={}"
                .format(my_args['misp_url'], body, r.status_code, r.text))

        # The response is 200/201/204 by this point or we would have raised
        response = r.json()
        encoder = json.JSONEncoder(ensure_ascii=False, separators=(',', ':'))
        if self.endpoint == "events":
            if 'response' in response:
                for r_item in response['response']:
                    if 'Event' in r_item:
                        attribute_names = []
                        serial_number = 0
                        for e in list(r_item.values()):
                            if keep_related is False:
                                e.pop('RelatedEvent', None)
                            if serial_number == 0:
                                for k in list(e.keys()):
                                    attribute_names.append(k)
                            yield MispCollectCommand._record(
                                serial_number, e['timestamp'], my_args['host'],
                                e, attribute_names, encoder)
                            serial_number += 1
                            self.flush()
        else:
            if 'response' in response:
                if 'Attribute' in response['response']:
                    attribute_names = []
                    serial_number = 0
                    for a in response['response']['Attribute']:
                        if serial_number == 0:
                            for k in list(a.keys()):
                                attribute_names.append(k)
                        yield MispCollectCommand._record(
                            serial_number, a['timestamp'], my_args['host'],
                            a, attribute_names, encoder)
                        serial_number += 1
                        self.flush()
class curlCommand(GeneratingCommand):
    # Authorization : Bearer cn389ncoiwuencr
    url = Option(require=True)
    paramMap = Option(require=False)
    output = Option(require=False, default='json')
    timeout = Option(require=False, default=10, validate=validators.Integer())
    auth = Option(require=False)
    headers = Option(require=False)
    proxies = Option(require=False)
    unsetProxy = Option(require=False, validate=validators.Boolean())

    def generate(self):
        url = self.url
        paramMap = self.parseParamMap(self.paramMap) if self.paramMap is not None else None
        output = self.output
        timeout = self.timeout if self.timeout is not None else None
        auth = self.parseAuth(self.auth) if self.auth is not None else None
        headers = self.parseHeaders(self.headers) if self.headers is not None else None
        proxies = self.parseProxies(self.proxies) if self.proxies is not None else None
        unsetProxy = self.unsetProxy

        # Unset proxy environment variables if unsetProxy=True
        if unsetProxy:
            if 'HTTP' in os.environ.keys():
                del os.environ['HTTP']
            if 'HTTPS' in os.environ.keys():
                del os.environ['HTTPS']

        # Load data from the REST API
        record = {}
        try:
            request = requests.get(
                url,
                params=paramMap,
                auth=auth,
                headers=headers,
                timeout=timeout,
                proxies=proxies
            )
            # Choose the right output format
            if output == 'json':
                record = request.json()
            else:
                record = {'response': request.content}
        except requests.exceptions.RequestException as err:
            record = {"Error:": err}
        yield record

    ''' HELPERS '''

    '''
    Parse paramMap into a URL query string
    @paramMap string: comma-separated parameters, e.g. 'foo=bar,hello=world' (use \, for a literal comma)
    @return string: 'foo=bar&hello=world'
    '''
    def parseParamMap(self, paramMap):
        paramStr = ''
        # Replace escaped commas (\,) with a placeholder so they survive the split
        placeholder = '\x00'
        paramMap = paramMap.replace(r'\,', placeholder)
        for param in paramMap.split(','):
            # Restore literal commas inside each parameter
            paramStr += param.replace(placeholder, ',').strip() + '&'
        # Delete the trailing &
        return paramStr[:-1]

    '''
    Parse proxies into a python dict
    @proxies string: Comma-separated proxies -> http,https
    @return dict
    '''
    def parseProxies(self, proxies):
        proxies = proxies.split(',')
        return {
            'http': proxies[0].strip(),
            'https': proxies[1].strip()
        }

    '''
    Parse auth into the structure requests expects for the chosen method
    @auth string: Comma-separated auth params -> method,user,pass
    @return tuple/object/bool
    '''
    def parseAuth(self, auth):
        # The password may contain commas, so split at most twice from the left
        auth = auth.split(',', 2)
        # Use the correct auth method
        if auth[0].lower() == 'basic':
            return (auth[1].strip(), auth[2].strip())
        elif auth[0].lower() == 'digest':
            return HTTPDigestAuth(auth[1].strip(), auth[2].strip())
        # Return False if no valid method was given
        return False

    '''
    Convert a headers string into a dict
    @headers string: Headers as a JSON string
    @return dict
    '''
    def parseHeaders(self, headers):
        # Replace single quotes with double quotes to get valid JSON
        return json.loads(headers.replace('\'', '"'))
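# Illustrative sketch (not part of the app): standalone versions of the parsing conventions used
# above, showing the expected option formats, e.g. auth="basic,user,pass",
# proxies="http://proxy:3128,https://proxy:3128", headers="{'X-Token': 'abc'}".
# Only basic auth is handled here; the names are hypothetical helpers.
import json


def parse_auth(auth):
    method, user, password = auth.split(',', 2)  # the password may itself contain commas
    return (user.strip(), password.strip()) if method.lower() == 'basic' else None


def parse_proxies(proxies):
    http_proxy, https_proxy = (p.strip() for p in proxies.split(','))
    return {'http': http_proxy, 'https': https_proxy}


def parse_headers(headers):
    # Single quotes are tolerated and converted to valid JSON
    return json.loads(headers.replace("'", '"'))


if __name__ == '__main__':
    print(parse_auth('basic,admin,s3cret,with,commas'))   # ('admin', 's3cret,with,commas')
    print(parse_proxies('http://proxy:3128, https://proxy:3128'))
    print(parse_headers("{'Authorization': 'Bearer cn389ncoiwuencr'}"))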
class MkJSONCommand(StreamingCommand):
    """
    ##Syntax
    ##Description
    ##Example
    """
    includehidden = Option(require=False, validate=validators.Boolean())
    outputfield = Option(require=False, validate=validators.Fieldname())
    sortkeys = Option(require=False, validate=validators.Boolean())

    def stream(self, events):
        if not self.outputfield:
            outputfield = "_raw"
        else:
            outputfield = self.outputfield
        if not self.includehidden:
            self.includehidden = False
        if not self.sortkeys:
            self.sortkeys = False
        for event in events:
            includedfields = set()
            if len(self.fieldnames) > 0:
                # Only serialize the fields named on the command line
                for fieldname in self.fieldnames:
                    if fieldname in event:
                        includedfields.add(fieldname)
                outputdict = {}
                for field in includedfields:
                    if len(event[field]) > 0:
                        outputdict[field] = event[field]
                event[outputfield] = json.dumps(outputdict, sort_keys=self.sortkeys)
            else:
                # Serialize every non-empty field; skip hidden fields (single leading
                # underscore) unless includehidden is set
                outputdict = {}
                for field in event:
                    if self.includehidden or not re.match(r'^_[^_]', field):
                        if len(event[field]) > 0:
                            outputdict[field] = event[field]
                event[outputfield] = json.dumps(outputdict, sort_keys=self.sortkeys)
            yield event
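# Illustrative sketch (not part of the command): the same field-selection rule applied to a plain
# dict. Fields matching ^_[^_] (a single leading underscore, e.g. _time or _raw) are treated as
# hidden and skipped unless includehidden is set; empty values are always dropped. mkjson here is
# a hypothetical standalone helper.
import json
import re


def mkjson(event, includehidden=False, sort_keys=False):
    out = {k: v for k, v in event.items()
           if (includehidden or not re.match(r'^_[^_]', k)) and len(v) > 0}
    return json.dumps(out, sort_keys=sort_keys)


if __name__ == '__main__':
    sample = {'_time': '1714000000', '_raw': 'GET /index.html', 'host': 'web01', 'status': '200', 'note': ''}
    print(mkjson(sample))                      # {"host": "web01", "status": "200"}
    print(mkjson(sample, includehidden=True))  # includes _time and _raw as well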
class TestSearchCommand(SearchCommand):

    boolean = Option(
        doc='''**Syntax:** **boolean=***<value>* **Description:** A boolean value''',
        validate=validators.Boolean())
    required_boolean = Option(
        doc='''**Syntax:** **boolean=***<value>* **Description:** A boolean value''',
        require=True, validate=validators.Boolean())
    aliased_required_boolean = Option(
        doc='''**Syntax:** **boolean=***<value>* **Description:** A boolean value''',
        name='foo', require=True, validate=validators.Boolean())
    code = Option(
        doc='''**Syntax:** **code=***<value>* **Description:** A Python expression, if mode == "eval", or statement, if mode == "exec"''',
        validate=validators.Code())
    required_code = Option(
        doc='''**Syntax:** **code=***<value>* **Description:** A Python expression, if mode == "eval", or statement, if mode == "exec"''',
        require=True, validate=validators.Code())
    duration = Option(
        doc='''**Syntax:** **duration=***<value>* **Description:** A length of time''',
        validate=validators.Duration())
    required_duration = Option(
        doc='''**Syntax:** **duration=***<value>* **Description:** A length of time''',
        require=True, validate=validators.Duration())
    fieldname = Option(
        doc='''**Syntax:** **fieldname=***<value>* **Description:** Name of a field''',
        validate=validators.Fieldname())
    required_fieldname = Option(
        doc='''**Syntax:** **fieldname=***<value>* **Description:** Name of a field''',
        require=True, validate=validators.Fieldname())
    file = Option(
        doc='''**Syntax:** **file=***<value>* **Description:** Name of a file''',
        validate=validators.File())
    required_file = Option(
        doc='''**Syntax:** **file=***<value>* **Description:** Name of a file''',
        require=True, validate=validators.File())
    integer = Option(
        doc='''**Syntax:** **integer=***<value>* **Description:** An integer value''',
        validate=validators.Integer())
    required_integer = Option(
        doc='''**Syntax:** **integer=***<value>* **Description:** An integer value''',
        require=True, validate=validators.Integer())
    map = Option(
        doc='''**Syntax:** **map=***<value>* **Description:** A mapping from one value to another''',
        validate=validators.Map(foo=1, bar=2, test=3))
    required_map = Option(
        doc='''**Syntax:** **map=***<value>* **Description:** A mapping from one value to another''',
        require=True, validate=validators.Map(foo=1, bar=2, test=3))
    match = Option(
        doc='''**Syntax:** **match=***<value>* **Description:** A value that matches a regular expression pattern''',
        validate=validators.Match('social security number', r'\d{3}-\d{2}-\d{4}'))
    required_match = Option(
        doc='''**Syntax:** **required_match=***<value>* **Description:** A value that matches a regular expression pattern''',
        require=True, validate=validators.Match('social security number', r'\d{3}-\d{2}-\d{4}'))
    optionname = Option(
        doc='''**Syntax:** **optionname=***<value>* **Description:** The name of an option (used internally)''',
        validate=validators.OptionName())
    required_optionname = Option(
        doc='''**Syntax:** **optionname=***<value>* **Description:** The name of an option (used internally)''',
        require=True, validate=validators.OptionName())
    regularexpression = Option(
        doc='''**Syntax:** **regularexpression=***<value>* **Description:** Regular expression pattern to match''',
        validate=validators.RegularExpression())
    required_regularexpression = Option(
        doc='''**Syntax:** **regularexpression=***<value>* **Description:** Regular expression pattern to match''',
        require=True, validate=validators.RegularExpression())
    set = Option(
        doc='''**Syntax:** **set=***<value>* **Description:** A member of a set''',
        validate=validators.Set('foo', 'bar', 'test'))
    required_set = Option(
        doc='''**Syntax:** **set=***<value>* **Description:** A member of a set''',
        require=True, validate=validators.Set('foo', 'bar', 'test'))

    class ConfigurationSettings(SearchCommand.ConfigurationSettings):

        @classmethod
        def fix_up(cls, command_class):
            pass
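# Quick illustration (assumes splunklib is importable, as in the test fixture above): validators
# are callables that convert the raw option string into a typed value or raise ValueError, which
# is what Option(validate=...) relies on when parsing command arguments.
from splunklib.searchcommands import validators

assert validators.Boolean()('t') is True
assert validators.Integer()('42') == 42
assert validators.Set('foo', 'bar', 'test')('foo') == 'foo'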
class GeoDistanceCommand(ReportingCommand):
    """
    Computes the distance of adjacent events

    ##Syntax
    .. code-block::
        geodistance latfield=<field> longfield=<field> output_field=<field> miles=<bool> group_by=field_to_group_by haversine=<bool>

    ##Description
    This search command calculates the relative Vincenty distances of adjacent events given their
    coordinates (latitudes and longitudes). It computes the distances in miles by default, but can
    be switched to kilometers by setting `miles=F`.
    It can also compute the adjacent distances per group when `group_by` is specified.

    ##Note:
    * Events that do not have latitudes or longitudes, as is the output when geocoding private
      non-routable IP addresses, will be given a distance of 0.0. The next relative distance will
      still be based on the last public address found.
    * The first event in the result will also have a distance of 0.0

    ##Example
    This example computes the relative distance for adjacent VPN connection attempts made by each user

    CLI:
    ..code-block::
        "index=vpn | stats count by src_ip, user | iplocation src_ip | fields src_ip, user, lat, lon
         | geodistance latfield=lat longfield=lon output_field=distance miles=F group_by=user haversine=False"
    """
    latfield = Option(doc='''
        **Syntax:** **latfield=** *<fieldname>*
        **Description:** Name of the field that holds the latitude''',
        require=True, validate=validators.Fieldname())

    longfield = Option(doc='''
        **Syntax:** **longfield=** *<fieldname>*
        **Description:** Name of the field that holds the longitude''',
        require=True, validate=validators.Fieldname())

    group_by = Option(doc='''
        **Syntax:** **group_by=** *<fieldname>*
        **Description:** Name of the field to be used to categorize events when computing distances''',
        require=False, validate=validators.Fieldname())

    miles = Option(doc='''
        **Syntax:** **miles=** *<bool>*
        **Description:** If set to true, this converts the distance to miles instead of km''',
        require=False, validate=validators.Boolean(), default=False)

    output_field = Option(doc='''
        **Syntax:** **output_field=** *<fieldname>*
        **Description:** Name of the field that will hold the relative distance returned in the output''',
        require=True, validate=validators.Fieldname())

    use_haversine = Option(name='haversine', doc='''
        **Syntax:** **haversine=** *<bool>*
        **Description:** If set to true, this calculates the haversine distance instead of the Vincenty distance''',
        require=False, validate=validators.Boolean(), default=False)

    def __init__(self):
        super(GeoDistanceCommand, self).__init__()
        environment.splunklib_logger = self._logger

    @Configuration()
    def map(self, events):
        for event in events:
            yield event

    def reduce(self, events):
        latitude = self.latfield
        longitude = self.longfield
        relative_distance = self.output_field
        use_haversine = bool(self.use_haversine)

        self.logger.info("[%s] - Starting geodistance instance" % str(self.metadata.searchinfo.sid))
        self.logger.debug(
            "[%s] - Using parameters - %s" % (str(self.metadata.searchinfo.sid), str(self.metadata)))

        if self.group_by:
            position_tracker = {}
            for event in events:
                current = event
                if not (current[latitude] or current[longitude]):
                    current[relative_distance] = 0.0
                    self.logger.debug(
                        "[%s] - Using distance=0 for private IPs or unknown coordinates. "
                        "Exclude if undesired." % str(self.metadata.searchinfo.sid))
                else:
                    current_pos = (float(current[latitude]), float(current[longitude]))
                    if current[self.group_by] not in position_tracker.keys():
                        last_pos = None
                    else:
                        last_pos = position_tracker[current[self.group_by]]
                    if last_pos is None:
                        current[relative_distance] = 0.0
                        self.logger.debug(
                            "[%s] - Initializing the first location with distance=0"
                            % str(self.metadata.searchinfo.sid))
                    else:
                        if use_haversine:
                            current[relative_distance] = haversine(
                                last_pos, current_pos, miles=bool(self.miles))
                        else:
                            current[relative_distance] = vincenty(
                                last_pos, current_pos, miles=bool(self.miles))
                    position_tracker[current[self.group_by]] = current_pos
                yield current
        else:
            last_pos = None
            for event in events:
                current = event
                if not (current[latitude] or current[longitude]):
                    current[relative_distance] = 0.0
                    self.logger.debug(
                        "[%s] - Using distance=0 for private IPs or unknown coordinates. "
                        "Exclude if undesired." % str(self.metadata.searchinfo.sid))
                else:
                    current_pos = (float(current[latitude]), float(current[longitude]))
                    if last_pos is None:
                        current[relative_distance] = 0.0
                        self.logger.debug(
                            "[%s] - Initializing the first location with distance=0"
                            % str(self.metadata.searchinfo.sid))
                    else:
                        if use_haversine:
                            current[relative_distance] = haversine(
                                last_pos, current_pos, miles=bool(self.miles))
                        else:
                            current[relative_distance] = vincenty(
                                last_pos, current_pos, miles=bool(self.miles))
                    last_pos = current_pos
                self.logger.debug(current)
                yield current
        self.logger.info("[%s] - Completed successfully." % str(self.metadata.searchinfo.sid))
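# For reference, a minimal great-circle implementation equivalent in spirit to the haversine()
# call used above (the command itself relies on external haversine/vincenty helpers, so this is
# only an illustrative sketch). Points are (lat, lon) tuples in decimal degrees; the result is in
# kilometers, or miles when miles=True.
from math import asin, cos, radians, sin, sqrt


def haversine_distance(point1, point2, miles=False):
    lat1, lon1 = map(radians, point1)
    lat2, lon2 = map(radians, point2)
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    distance = 2 * 6371.0 * asin(sqrt(a))  # 6371.0 = mean Earth radius in km
    return distance * 0.621371 if miles else distance


if __name__ == '__main__':
    # London -> Paris, roughly 344 km
    print(round(haversine_distance((51.5074, -0.1278), (48.8566, 2.3522)), 1))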