def _import_collection(self, client, site, collection, data_set=False):
    collection_name = collection.name
    sanitized_feed_name = cleanup_string(
        "%s%s" % (site.get('site'), collection_name))
    feed_summary = "%s %s" % (site.get('site'), collection_name)
    available = collection.available
    collection_type = collection.type
    default_score = site.get('default_score')
    logger.info("%s,%s,%s,%s,%s" % (site.get('site'), collection_name,
                                    sanitized_feed_name, available,
                                    collection_type))
    if not available:
        return False

    #
    # Sanity check on start date
    #
    start_date_str = site.get('start_date')
    if not start_date_str or len(start_date_str) == 0:
        start_date_str = "2017-01-01 00:00:00"

    #
    # Create a feed helper object
    #
    feed_helper = FeedHelper(site.get('output_path'),
                             sanitized_feed_name,
                             site.get('minutes_to_advance'),
                             start_date_str)

    if not data_set:
        logger.info("Feed start time %s" % feed_helper.start_date)

    logger.info("polling Collection: {}...".format(collection.name))

    #
    # Build up the URI for polling
    #
    if not site.get('poll_path', ''):
        uri = None
    else:
        uri = ''
        if site.get('use_https'):
            uri += 'https://'
        else:
            uri += 'http://'
        uri += site.get('site')
        uri += site.get('poll_path')
        logger.info('Poll path: {}'.format(uri))

    reports = []
    while True:
        try:
            try:
                content_blocks = client.poll(
                    uri=uri,
                    collection_name=collection.name,
                    begin_date=feed_helper.start_date,
                    end_date=feed_helper.end_date,
                    content_bindings=BINDING_CHOICES)
            except Exception as e:
                logger.info(e.message)
                content_blocks = []

            #
            # Iterate through all content_blocks
            #
            num_blocks = 0

            if not data_set:
                logger.info("polling start_date: {}, end_date: {}".format(
                    feed_helper.start_date, feed_helper.end_date))

            for block in content_blocks:
                #
                # if in export mode then save off this content block
                #
                if self.export_dir:
                    self.export_xml(collection_name,
                                    feed_helper.start_date,
                                    feed_helper.end_date,
                                    num_blocks,
                                    block.content)

                #
                # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
                # etree.fromstring can parse this data.
                #
                try:
                    root = etree.fromstring(block.content)
                    content = root.find(
                        './/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content')
                    if content is not None and len(content) == 0 and len(list(content)) == 0:
                        #
                        # Content has no children, so let's parse the xml text for content and re-add
                        # it as valid XML so we can parse it
                        #
                        new_stix_package = etree.fromstring(
                            root.find(
                                "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content"
                            ).text)
                        content.append(new_stix_package)

                    #
                    # Since we modified the xml, we need to create a new xml message string to parse
                    #
                    message = etree.tostring(root)

                    #
                    # Write the content block to disk so we can parse with python stix
                    #
                    file_handle, file_path = self.write_to_temp_file(message)

                    #
                    # Parse STIX data
                    #
                    stix_package = STIXPackage.from_xml(file_path)

                    #
                    # if it is a DATA_SET make feed_summary from the stix_header description
                    # NOTE: this is for RecordedFuture; also note that we only do this for data_sets.
                    # to date I have only seen RecordedFuture use data_sets
                    #
                    if data_set and stix_package.stix_header and stix_package.stix_header.descriptions:
                        for desc in stix_package.stix_header.descriptions:
                            feed_summary = desc.value
                            break

                    #
                    # Get the timestamp of the STIX Package so we can use this in our feed
                    #
                    timestamp = total_seconds(stix_package.timestamp)

                    if stix_package.indicators:
                        for indicator in stix_package.indicators:
                            if not indicator or not indicator.observable:
                                continue

                            if not indicator.timestamp:
                                timestamp = 0
                            else:
                                timestamp = int(
                                    (indicator.timestamp -
                                     datetime.datetime(1970, 1, 1).replace(
                                         tzinfo=dateutil.tz.tzutc())).total_seconds())

                            reports.extend(
                                cybox_parse_observable(
                                    indicator.observable, indicator, timestamp, default_score))

                    #
                    # Now let's find some data.  Iterate through all observables and parse
                    #
                    if stix_package.observables:
                        for observable in stix_package.observables:
                            if not observable:
                                continue

                            #
                            # Cybox observable returns a list
                            #
                            reports.extend(
                                cybox_parse_observable(
                                    observable, None, timestamp, default_score))

                    #
                    # Delete our temporary file
                    #
                    file_handle.close()

                    num_blocks += 1

                    #
                    # end for loop through content blocks
                    #
                except Exception as e:
                    # logger.info(traceback.format_exc())
                    logger.info(e.message)
                    continue

            logger.info("content blocks read: {}".format(num_blocks))
            logger.info("current number of reports: {}".format(len(reports)))

            #
            # DEBUG CODE
            #
            # if len(reports) > 10:
            #     break

            #
            # Attempt to advance the start time and end time
            #
        except Exception as e:
            logger.info(traceback.format_exc())

        #
        # If it is just a data_set, the data is unordered, so we can just break out of the while loop
        #
        if data_set:
            break

        if feed_helper.advance():
            continue
        else:
            break

    #
    # end While True
    #

    logger.info("Found {} new reports.".format(len(reports)))

    reports = feed_helper.load_existing_feed_data() + reports

    logger.info("Total number of reports: {}".format(len(reports)))

    data = build_feed_data(sanitized_feed_name,
                           "%s %s" % (site.get('site'), collection_name),
                           feed_summary,
                           site.get('site'),
                           site.get('icon_link'),
                           reports)

    if feed_helper.write_feed(data):
        feed_helper.save_details()

    #
    # Create Cb Response Feed if necessary
    #
    feed_id = self.cb.feed_get_id_by_name(sanitized_feed_name)
    if not feed_id:
        self.cb.feed_add_from_url("file://" + feed_helper.path,
                                  site.get('feeds_enable'),
                                  False,
                                  False)
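# cleanup_string() is imported from elsewhere in this connector and is not defined in this
# file. Based on how it is used above -- turning "<site><collection_name>" into a sanitized
# feed name -- it presumably strips characters that are unsafe in feed/file names. A minimal
# illustrative sketch under that assumption (hypothetical helper, not the connector's actual
# implementation):

import re


def _example_cleanup_string(value):
    # Keep only alphanumeric characters so the result is safe to use as a feed name
    # and as part of a file path.
    return re.sub(r'[^a-zA-Z0-9]', '', str(value))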
def _import_collection(self, client, site, collection):
    collection_name = collection.get('collection_name', '')
    sanitized_feed_name = cleanup_string("%s%s" % (site.get('site'), collection_name))
    available = collection.get('available', False)
    collection_type = collection.get('collection_type', '').upper()
    _logger.info("%s,%s,%s,%s,%s" % (site.get('site'), collection_name, sanitized_feed_name,
                                     available, collection_type))

    if not available or collection_type != "DATA_FEED":
        return

    start_date_str = site.get('start_date')
    if not start_date_str or len(start_date_str) == 0:
        start_date_str = "2015-04-01 00:00:00"

    feed_helper = FeedHelper(site.get('output_path'), sanitized_feed_name,
                             site.get('minutes_to_advance'), start_date_str, self.export_mode)

    _logger.info("Feed start time %s" % feed_helper.start_date)

    reports = []

    # CATCHUP -- TODO, move to a function??
    while True:
        these_reports = []
        tries = 0
        while tries < 5:
            try:
                if feed_helper.start_date > feed_helper.end_date:
                    break

                t1 = time.time()
                message = client.retrieve_collection(collection_name, feed_helper.start_date,
                                                     feed_helper.end_date)
                t2 = time.time()

                message_len = len(message)

                if self.export_mode:
                    path = self._export_message_to_disk(sanitized_feed_name, feed_helper.start_date,
                                                        feed_helper.end_date, message)
                    _logger.info("%s - %s - %s - %d (%f)- %s" % (feed_helper.start_date,
                                                                 feed_helper.end_date,
                                                                 collection_name, message_len,
                                                                 (t2 - t1), path))
                    message = None
                else:
                    filepath = self._write_message_to_disk(message)
                    message = None
                    site_url = "%s://%s" % ("https" if site.get('use_https') else "http", site.get('site'))
                    these_reports = self._message_to_reports(filepath, site.get('site'), site_url,
                                                             collection_name, site.get('enable_ip_ranges'))
                    t3 = time.time()
                    os.remove(filepath)
                    count = len(these_reports)
                    _logger.info("%s - %s - %s - %d (%d)(%.2f)(%.2f)" % (feed_helper.start_date,
                                                                         feed_helper.end_date,
                                                                         collection_name, count,
                                                                         message_len, (t2 - t1), (t3 - t2)))
                break
            except:
                _logger.error("%s" % traceback.format_exc())
                time.sleep(5)
                tries += 1

        if tries == 5:
            _logger.error("Giving up for site %s, collection %s" % (site.get('site'), collection))
            return

        if not self.export_mode:
            reports.extend(these_reports)

        if not feed_helper.advance():
            break
    ########## end while (for iterating across time)

    _logger.info("COMPLETED %s,%s,%s,%s,%s (%d)" % (site.get('site'), collection_name,
                                                    sanitized_feed_name, available,
                                                    collection_type, len(reports)))

    if not self.export_mode:
        # TODO -- clean this up
        if len(reports) > 0:
            # load existing data and convert new data
            reports = feed_helper.load_existing_feed_data() + reports

            # convert feed info and reports to json
            data = build_feed_data(sanitized_feed_name,
                                   "%s %s" % (site.get('site'), collection_name),
                                   site.get('site'),
                                   site.get('icon_link'),
                                   reports)

            # SAVE THE DATA: write out the feed file and save the details for when we last queried it
            if feed_helper.write_feed(data):
                feed_helper.save_details()

            # Actually add CB feed if necessary
            feed_id = self.cb.feed_get_id_by_name(sanitized_feed_name)
            if not feed_id:
                data = self.cb.feed_add_from_url("file://" + feed_helper.path,
                                                 site.get('feeds_enable'),
                                                 False,
                                                 False)

                # FEED ALERTING!!
                feed_id = data.get('id')
                url = "https://127.0.0.1/api/v1/feed/%d/action" % feed_id
                alert_types = site.get('feeds_alerting', '').split(',')
                headers = {'X-Auth-Token': self.api_token, "Accept": "application/json"}
                for alert in alert_types:
                    if alert.lower() == "syslog":
                        action_data = {"action_data": """{"email_recipients":[1]}""",
                                       "action_type": 1,
                                       "group_id": feed_id,
                                       "watchlist_id": ""}
                        resp = requests.post(url, headers=headers, data=json.dumps(action_data), verify=False)
                        if resp.status_code != 200:
                            _logger.warn("Error for syslog action (%d): %s" % (feed_id, resp.content))
                    elif alert.lower() == "cb":
                        action_data = {"action_data": """{"email_recipients":[1]}""",
                                       "action_type": 3,
                                       "group_id": feed_id,
                                       "watchlist_id": ""}
                        resp = requests.post(url, headers=headers, data=json.dumps(action_data), verify=False)
                        if resp.status_code != 200:
                            _logger.warn("Error for cb action (%d): %s" % (feed_id, resp.content))
        else:
            # no reports
            feed_helper.save_details()
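# The catch-up loop above retries a failed collection retrieval up to five times, sleeping
# five seconds between attempts, and gives up on the site if every try fails. The same
# pattern pulled out as a generic sketch (hypothetical helper, not part of the connector):

import time
import traceback


def _example_retry(func, max_tries=5, delay_seconds=5, log=None):
    """Call func() until it succeeds or max_tries is exhausted; return (succeeded, result)."""
    for _ in range(max_tries):
        try:
            return True, func()
        except Exception:
            if log:
                log.error("%s" % traceback.format_exc())
            time.sleep(delay_seconds)
    return False, None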
def _import_collection(self, client, site, collection):
    collection_name = collection.name
    sanitized_feed_name = cleanup_string("%s%s" % (site.get('site'), collection_name))
    available = collection.available
    collection_type = collection.type
    logger.info("%s,%s,%s,%s,%s" % (site.get('site'),
                                    collection_name,
                                    sanitized_feed_name,
                                    available,
                                    collection_type))
    #
    # We only care about DATA_FEED type
    #
    if not available or collection_type != "DATA_FEED":
        return False

    #
    # Sanity check on start date
    #
    start_date_str = site.get('start_date')
    if not start_date_str or len(start_date_str) == 0:
        start_date_str = "2016-12-01 00:00:00"

    #
    # Create a feed helper object
    #
    feed_helper = FeedHelper(site.get('output_path'),
                             sanitized_feed_name,
                             site.get('minutes_to_advance'),
                             start_date_str)

    logger.info("Feed start time %s" % feed_helper.start_date)

    logger.info("polling Collection: {}...".format(collection.name))

    #
    # Build up the URI for polling
    #
    if not site.get('poll_path', ''):
        uri = None
    else:
        uri = ''
        if site.get('use_https'):
            uri += 'https://'
        else:
            uri += 'http://'
        uri += site.get('site')
        uri += site.get('poll_path')
        logger.info('Poll path: {}'.format(uri))

    reports = []
    while True:
        content_blocks = client.poll(uri=uri,
                                     collection_name=collection.name,
                                     begin_date=feed_helper.start_date,
                                     end_date=feed_helper.end_date,
                                     #content_bindings=BINDING_CHOICES,
                                     content_bindings=[CB_STIX_XML_12])

        #
        # Iterate through all content_blocks
        #
        num_blocks = 0
        for block in content_blocks:
            #
            # if in export mode then save off this content block
            #
            if self.export_dir:
                self.export_xml(collection_name,
                                feed_helper.start_date,
                                feed_helper.end_date,
                                num_blocks,
                                block.content)

            #
            # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
            # etree.fromstring can parse this data.
            #
            root = etree.fromstring(block.content)
            content = root.find('.//{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content')
            if content is not None and len(content) == 0 and len(list(content)) == 0:
                #
                # Content has no children, so let's parse the xml text for content and re-add
                # it as valid XML so we can parse it
                #
                new_stix_package = etree.fromstring(root.find(
                    "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content").text)
                content.append(new_stix_package)

            #
            # Since we modified the xml, we need to create a new xml message string to parse
            #
            message = etree.tostring(root)

            #
            # Write the content block to disk so we can parse with python stix
            #
            file_handle, file_path = self.write_to_temp_file(message)

            #
            # Parse STIX data
            #
            stix_package = STIXPackage.from_xml(file_path)

            #
            # Get the timestamp of the STIX Package so we can use this in our feed
            #
            timestamp = total_seconds(stix_package.timestamp)

            #
            # Now let's find some data.  Iterate through all observables and parse
            #
            if stix_package.observables:
                for observable in stix_package.observables:
                    #
                    # Cybox observable returns a list
                    #
                    reports.extend(cybox_parse_observable(observable, timestamp))

            #
            # Delete our temporary file
            #
            file_handle.close()

            num_blocks += 1

            #
            # end for loop through content blocks
            #

        logger.info("content blocks read: {}".format(num_blocks))
        logger.info("current number of reports: {}".format(len(reports)))

        #
        # DEBUG CODE
        #
        # if len(reports) > 10:
        #     break

        #
        # Attempt to advance the start time and end time
        #
        if feed_helper.advance():
            continue
        else:
            break

    #
    # end While True
    #

    logger.info("Found {} new reports.".format(len(reports)))

    reports = feed_helper.load_existing_feed_data() + reports

    logger.info("Total number of reports: {}".format(len(reports)))

    data = build_feed_data(sanitized_feed_name,
                           "%s %s" % (site.get('site'), collection_name),
                           site.get('site'),
                           site.get('icon_link'),
                           reports)

    if feed_helper.write_feed(data):
        feed_helper.save_details()

    #
    # Create Cb Response Feed if necessary
    #
    feed_id = self.cb.feed_get_id_by_name(sanitized_feed_name)
    if not feed_id:
        data = self.cb.feed_add_from_url("file://" + feed_helper.path,
                                         site.get('feeds_enable'),
                                         False,
                                         False)
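# total_seconds() above converts the STIX package timestamp into seconds since the UNIX
# epoch for use in the feed report. An equivalent sketch of that conversion, matching the
# inline datetime arithmetic used for indicator timestamps in the other versions of this
# function (hypothetical helper, assumes a timezone-aware datetime):

import datetime

import dateutil.tz


def _example_epoch_seconds(dt):
    """Return seconds since the UNIX epoch for a timezone-aware datetime, or 0 if dt is falsy."""
    if not dt:
        return 0
    epoch = datetime.datetime(1970, 1, 1).replace(tzinfo=dateutil.tz.tzutc())
    return int((dt - epoch).total_seconds())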
def _import_collection(self, client, site, collection, data_set=False):
    collection_name = collection.name
    sanitized_feed_name = cleanup_string(
        "%s%s" % (site.get('site'), collection_name))
    feed_summary = "%s %s" % (site.get('site'), collection_name)
    available = collection.available
    collection_type = collection.type
    default_score = site.get('default_score')
    logger.info("%s,%s,%s,%s,%s" % (site.get('site'), collection_name,
                                    sanitized_feed_name, available,
                                    collection_type))
    if not available:
        return False

    #
    # Sanity check on start date
    #
    start_date_str = site.get('start_date')
    if not start_date_str or len(start_date_str) == 0:
        start_date_str = "2019-01-01 00:00:00"

    #
    # Create a feed helper object
    #
    feed_helper = FeedHelper(site.get('output_path'), sanitized_feed_name,
                             site.get('minutes_to_advance'), start_date_str)

    if not data_set:
        logger.info("Feed start time %s" % feed_helper.start_date)

    logger.info("polling Collection: {}...".format(collection.name))

    #
    # Build up the URI for polling
    #
    if not site.get('poll_path', ''):
        uri = None
    else:
        uri = ''
        if site.get('use_https'):
            uri += 'https://'
        else:
            uri += 'http://'
        uri += site.get('site')
        uri += site.get('poll_path')
        logger.info('Poll path: {}'.format(uri))

    reports = []
    while True:
        num_times_empty_content_blocks = 0
        try:
            try:
                logger.info("Polling Collection: {0}".format(collection.name))
                content_blocks = client.poll(
                    uri=uri,
                    collection_name=collection.name,
                    begin_date=feed_helper.start_date,
                    end_date=feed_helper.end_date,
                    content_bindings=BINDING_CHOICES)
            except Exception as e:
                logger.info(e.message)
                content_blocks = []

            #
            # Iterate through all content_blocks
            #
            num_blocks = 0

            if not data_set:
                logger.info("polling start_date: {}, end_date: {}".format(
                    feed_helper.start_date, feed_helper.end_date))

            for block in content_blocks:
                logger.debug(block.content)

                #
                # if in export mode then save off this content block
                #
                if self.export_dir:
                    self.export_xml(collection_name,
                                    feed_helper.start_date,
                                    feed_helper.end_date,
                                    num_blocks,
                                    block.content)

                #
                # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
                # etree.fromstring can parse this data.
                #
                try:
                    root = etree.fromstring(block.content)
                    content = root.find(
                        './/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content')
                    if content is not None and len(content) == 0 and len(list(content)) == 0:
                        #
                        # Content has no children, so let's parse the xml text for content and re-add
                        # it as valid XML so we can parse it
                        #
                        new_stix_package = etree.fromstring(
                            root.find(
                                "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content"
                            ).text)
                        content.append(new_stix_package)

                    #
                    # Since we modified the xml, we need to create a new xml message string to parse
                    #
                    message = etree.tostring(root)

                    #
                    # Write the content block to disk so we can parse with python stix
                    #
                    file_handle, file_path = self.write_to_temp_file(message)

                    #
                    # Parse STIX data
                    #
                    stix_package = STIXPackage.from_xml(file_path)

                    #
                    # if it is a DATA_SET make feed_summary from the stix_header description
                    # NOTE: this is for RecordedFuture; also note that we only do this for data_sets.
                    # to date I have only seen RecordedFuture use data_sets
                    #
                    if data_set and stix_package.stix_header and stix_package.stix_header.descriptions:
                        for desc in stix_package.stix_header.descriptions:
                            feed_summary = "{}: {}".format(desc.value, collection_name)
                            break

                    #
                    # Get the timestamp of the STIX Package so we can use this in our feed
                    #
                    timestamp = total_seconds(stix_package.timestamp)

                    if not stix_package.indicators and not stix_package.observables:
                        num_times_empty_content_blocks += 1
                        if num_times_empty_content_blocks > 10:
                            break

                    if stix_package.indicators:
                        for indicator in stix_package.indicators:
                            if not indicator or not indicator.observable:
                                continue

                            if indicator.confidence:
                                if str(indicator.confidence.value).isdigit():
                                    #
                                    # Get the confidence score and use it for our score
                                    #
                                    score = int(indicator.confidence.to_dict().get("value", default_score))
                                else:
                                    if str(indicator.confidence.value).lower() == "high":
                                        score = 75
                                    elif str(indicator.confidence.value).lower() == "medium":
                                        score = 50
                                    elif str(indicator.confidence.value).lower() == "low":
                                        score = 25
                                    else:
                                        score = default_score
                            else:
                                score = default_score

                            if not indicator.timestamp:
                                timestamp = 0
                            else:
                                timestamp = int(
                                    (indicator.timestamp -
                                     datetime.datetime(1970, 1, 1).replace(
                                         tzinfo=dateutil.tz.tzutc())).total_seconds())

                            reports.extend(
                                cybox_parse_observable(
                                    indicator.observable, indicator, timestamp, score))

                    #
                    # Now let's find some data.  Iterate through all observables and parse
                    #
                    if stix_package.observables:
                        for observable in stix_package.observables:
                            if not observable:
                                continue

                            #
                            # Cybox observable returns a list
                            #
                            reports.extend(
                                cybox_parse_observable(
                                    observable, None, timestamp, default_score))

                    #
                    # Delete our temporary file
                    #
                    file_handle.close()

                    num_blocks += 1

                    #
                    # end for loop through content blocks
                    #
                except Exception as e:
                    # logger.info(traceback.format_exc())
                    logger.info(e.message)
                    continue

            logger.info("content blocks read: {}".format(num_blocks))
            logger.info("current number of reports: {}".format(len(reports)))

            if len(reports) > site.get('reports_limit'):
                logger.info("We have reached the reports limit of {0}".format(site.get('reports_limit')))
                break

            #
            # DEBUG CODE
            #
            # if len(reports) > 10:
            #     break

            #
            # Attempt to advance the start time and end time
            #
        except Exception as e:
            logger.info(traceback.format_exc())

        #
        # If it is just a data_set, the data is unordered, so we can just break out of the while loop
        #
        if data_set:
            break

        if feed_helper.advance():
            continue
        else:
            break

    #
    # end While True
    #

    logger.info("Found {} new reports.".format(len(reports)))

    if not data_set:
        #
        # We only want to concatenate if we are NOT a data set, otherwise we want to refresh all the reports
        #
        logger.info("Adding existing reports...")
        reports = feed_helper.load_existing_feed_data() + reports

    logger.info("Total number of reports: {}".format(len(reports)))

    if site.get('reports_limit') < len(reports):
        logger.info("Truncating reports to length {0}".format(site.get('reports_limit')))
        reports = reports[:site.get('reports_limit')]

    data = build_feed_data(sanitized_feed_name,
                           "%s %s" % (site.get('site'), collection_name),
                           feed_summary,
                           site.get('site'),
                           site.get('icon_link'),
                           reports)

    if feed_helper.write_feed(data):
        feed_helper.save_details()

    #
    # Create Cb Response Feed if necessary
    #
    feed_id = None

    try:
        feeds = get_object_by_name_or_id(self.cb, Feed, name=sanitized_feed_name)

        if not feeds:
            logger.info("Feed {} was not found, so we are going to create it".format(sanitized_feed_name))
        elif len(feeds) > 1:
            logger.warning("Multiple feeds found, selecting Feed id {}".format(feeds[0].id))
            feed_id = feeds[0].id
        elif feeds:
            feed_id = feeds[0].id
            logger.info("Feed {} was found as Feed ID {}".format(sanitized_feed_name, feed_id))
    except Exception as e:
        logger.info(e.message)

    if not feed_id:
        logger.info("Creating {} feed for the first time".format(sanitized_feed_name))

        f = self.cb.create(Feed)
        f.feed_url = "file://" + feed_helper.path
        f.enabled = site.get('feeds_enable')
        f.use_proxy = False
        f.validate_server_cert = False

        try:
            f.save()
        except ServerError as se:
            if se.error_code == 500:
                logger.info("Could not add feed:")
                logger.info(" Received error code 500 from server. This is usually because the server cannot retrieve the feed.")
                logger.info(" Check to ensure the Cb server has network connectivity and the credentials are correct.")
            else:
                logger.info("Could not add feed: {0:s}".format(str(se)))
        except Exception as e:
            logger.info("Could not add feed: {0:s}".format(str(e)))
        else:
            logger.info("Feed data: {0:s}".format(str(f)))
            logger.info("Added feed. New feed ID is {0:d}".format(f.id))
            feed_id = f.id

    return feed_id
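# The indicator handling above maps a STIX confidence value onto a feed report score:
# a numeric confidence is used directly, "high"/"medium"/"low" become 75/50/25, and
# anything else falls back to the site's default_score. The same mapping as a standalone
# sketch (hypothetical helper, not part of the connector):

def _example_confidence_to_score(confidence_value, default_score):
    """Translate a STIX indicator confidence value into a numeric feed score."""
    if confidence_value is None:
        return default_score
    value = str(confidence_value)
    if value.isdigit():
        return int(value)
    value = value.lower()
    if value == "high":
        return 75
    if value == "medium":
        return 50
    if value == "low":
        return 25
    return default_score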