Example #1
0
    def _import_collection(self, client, site, collection, data_set=False):
        """Poll one TAXII collection and turn its STIX content into feed reports.

        Walks the collection in time windows managed by a FeedHelper, parses
        every returned content block into a STIXPackage, converts indicators
        and observables into reports, then writes the feed to disk and adds a
        Cb Response feed for it when one does not already exist.

        :param client: TAXII client used for polling
        :param site: dict of per-site settings (site, output_path, start_date,
                     minutes_to_advance, poll_path, use_https, default_score,
                     icon_link, feeds_enable, ...)
        :param collection: TAXII collection (exposes .name, .available, .type)
        :param data_set: True for unordered DATA_SET collections -- polled a
                         single time with no time-window advancing
        :return: False when the collection is unavailable, otherwise None
        """
        collection_name = collection.name
        sanitized_feed_name = cleanup_string(
            "%s%s" % (site.get('site'), collection_name))
        feed_summary = "%s %s" % (site.get('site'), collection_name)
        available = collection.available
        collection_type = collection.type
        default_score = site.get('default_score')
        logger.info("%s,%s,%s,%s,%s" %
                    (site.get('site'), collection_name, sanitized_feed_name,
                     available, collection_type))

        if not available:
            return False

        #
        # Sanity check on start date; fall back to a fixed default when unset
        #
        start_date_str = site.get('start_date')
        if not start_date_str or len(start_date_str) == 0:
            start_date_str = "2017-01-01 00:00:00"

        #
        # Create a feed helper object that tracks the polling time window
        #
        feed_helper = FeedHelper(site.get('output_path'), sanitized_feed_name,
                                 site.get('minutes_to_advance'),
                                 start_date_str)

        if not data_set:
            logger.info("Feed start time %s" % feed_helper.start_date)
        logger.info("polling Collection: {}...".format(collection.name))

        #
        # Build up the URI for polling; None lets the client use its default
        #
        if not site.get('poll_path', ''):
            uri = None
        else:
            uri = ''
            if site.get('use_https'):
                uri += 'https://'
            else:
                uri += 'http://'

            uri += site.get('site')
            uri += site.get('poll_path')
            logger.info('Poll path: {}'.format(uri))

        reports = []
        while True:

            try:
                try:
                    content_blocks = client.poll(
                        uri=uri,
                        collection_name=collection.name,
                        begin_date=feed_helper.start_date,
                        end_date=feed_helper.end_date,
                        content_bindings=BINDING_CHOICES)

                except Exception as e:
                    # A failed poll is logged and treated as an empty window.
                    # NOTE: was "e.message", which does not exist in Python 3
                    # and raises AttributeError for most exception types.
                    logger.info("%s", e)
                    content_blocks = []

                #
                # Iterate through all content_blocks
                #
                num_blocks = 0

                if not data_set:
                    logger.info("polling start_date: {}, end_date: {}".format(
                        feed_helper.start_date, feed_helper.end_date))
                for block in content_blocks:

                    #
                    # if in export mode then save off this content block
                    #
                    if self.export_dir:
                        self.export_xml(collection_name,
                                        feed_helper.start_date,
                                        feed_helper.end_date, num_blocks,
                                        block.content)

                    #
                    # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
                    # etree.fromstring can parse this data.
                    #
                    try:
                        root = etree.fromstring(block.content)
                        content = root.find(
                            './/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content'
                        )
                        if content is not None and len(content) == 0 and len(
                                list(content)) == 0:
                            #
                            # Content has no children.  So lets make sure we parse the xml text for content and re-add
                            # it as valid XML so we can parse
                            #
                            new_stix_package = etree.fromstring(
                                root.find(
                                    "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content"
                                ).text)
                            content.append(new_stix_package)

                        #
                        # Since we modified the xml, we need create a new xml message string to parse
                        #
                        message = etree.tostring(root)

                        #
                        # Write the content block to disk so we can parse with
                        # python stix; close the handle even when parsing
                        # raises (the original leaked it on that path)
                        #
                        file_handle, file_path = self.write_to_temp_file(
                            message)
                        try:
                            #
                            # Parse STIX data
                            #
                            stix_package = STIXPackage.from_xml(file_path)

                            #
                            # if it is a DATA_SET make feed_summary from the stix_header description
                            # NOTE: this is for RecordedFuture, also note that we only do this for data_sets.
                            #       to date I have only seen RecordedFuture use data_sets
                            #
                            if data_set and stix_package.stix_header and stix_package.stix_header.descriptions:
                                for desc in stix_package.stix_header.descriptions:
                                    feed_summary = desc.value
                                    break

                            #
                            # Get the timestamp of the STIX Package so we can use this in our feed
                            #
                            timestamp = total_seconds(stix_package.timestamp)

                            if stix_package.indicators:
                                for indicator in stix_package.indicators:
                                    if not indicator or not indicator.observable:
                                        continue

                                    # Prefer the indicator's own timestamp
                                    # (seconds since the unix epoch, UTC)
                                    if not indicator.timestamp:
                                        timestamp = 0
                                    else:
                                        timestamp = int(
                                            (indicator.timestamp -
                                             datetime.datetime(1970, 1, 1).replace(
                                                 tzinfo=dateutil.tz.tzutc())
                                             ).total_seconds())

                                    reports.extend(
                                        cybox_parse_observable(
                                            indicator.observable, indicator,
                                            timestamp, default_score))

                            #
                            # Now lets find some data.  Iterate through all observables and parse
                            # NOTE(review): timestamp here still carries the
                            # last indicator's value when indicators were
                            # present -- behavior preserved from the original
                            #
                            if stix_package.observables:
                                for observable in stix_package.observables:
                                    if not observable:
                                        continue
                                    #
                                    # Cybox observable returns a list
                                    #
                                    reports.extend(
                                        cybox_parse_observable(
                                            observable, None, timestamp,
                                            default_score))
                        finally:
                            #
                            # Delete our temporary file
                            #
                            file_handle.close()

                        num_blocks += 1

                        #
                        # end for loop through content blocks
                        #

                    except Exception as e:
                        # Skip a malformed block but keep processing the rest
                        logger.info("%s", e)
                        continue

                logger.info("content blocks read: {}".format(num_blocks))
                logger.info("current number of reports: {}".format(
                    len(reports)))

            except Exception as e:
                logger.info(traceback.format_exc())

            #
            # If it is just a data_set, the data is unordered, so we can just break out of the while loop
            #
            if data_set:
                break

            # Advance the polling window; stop once it catches up to "now"
            if feed_helper.advance():
                continue
            else:
                break
            #
            # end While True
            #

        logger.info("Found {} new reports.".format(len(reports)))

        reports = feed_helper.load_existing_feed_data() + reports

        logger.info("Total number of reports: {}".format(len(reports)))

        data = build_feed_data(sanitized_feed_name,
                               "%s %s" % (site.get('site'), collection_name),
                               feed_summary, site.get('site'),
                               site.get('icon_link'), reports)

        if feed_helper.write_feed(data):
            feed_helper.save_details()

        #
        # Create Cb Response Feed if necessary
        #
        feed_id = self.cb.feed_get_id_by_name(sanitized_feed_name)
        if not feed_id:
            self.cb.feed_add_from_url("file://" + feed_helper.path,
                                      site.get('feeds_enable'), False, False)
Example #2
0
    def _import_collection(self, client, site, collection):
        """Import one DATA_FEED collection from a TAXII site into a Cb feed.

        Retrieves the collection in time windows (each window retried up to
        five times), converts retrieved messages to reports -- or, in export
        mode, saves the raw messages to disk -- then writes the combined feed
        and registers it (with optional syslog/cb alert actions) in
        Cb Response when no feed of that name exists yet.

        :param client: TAXII client exposing retrieve_collection()
        :param site: dict of per-site settings (site, start_date,
                     minutes_to_advance, output_path, use_https,
                     enable_ip_ranges, feeds_enable, feeds_alerting, ...)
        :param collection: dict with collection_name, available and
                           collection_type keys
        """
        collection_name = collection.get('collection_name', '')
        sanitized_feed_name = cleanup_string("%s%s" % (site.get('site'), collection_name))
        available = collection.get('available', False)
        collection_type = collection.get('collection_type', '').upper()
        _logger.info("%s,%s,%s,%s,%s" % (site.get('site'),
                                              collection_name,
                                              sanitized_feed_name,
                                              available,
                                              collection_type))

        # Only available DATA_FEED collections are imported
        if not available or collection_type != "DATA_FEED":
            return

        # Sanity check on start date; fall back to a fixed default when unset
        start_date_str = site.get('start_date')
        if not start_date_str or len(start_date_str) == 0:
            start_date_str = "2015-04-01 00:00:00"

        feed_helper = FeedHelper(site.get('output_path'), sanitized_feed_name, site.get('minutes_to_advance'), start_date_str, self.export_mode)

        _logger.info("Feed start time %s" % feed_helper.start_date)

        reports = []
        # CATCHUP -- TODO, move to a function??
        while True:
            these_reports = []
            tries = 0
            while tries < 5:
                try:
                    if feed_helper.start_date > feed_helper.end_date:
                        break

                    t1 = time.time()
                    message = client.retrieve_collection(collection_name, feed_helper.start_date, feed_helper.end_date)
                    t2 = time.time()

                    message_len = len(message)

                    if self.export_mode:
                        path = self._export_message_to_disk(sanitized_feed_name, feed_helper.start_date, feed_helper.end_date, message)
                        _logger.info("%s - %s - %s - %d (%f)- %s" % (feed_helper.start_date, feed_helper.end_date, collection_name, message_len, (t2-t1), path))
                        message = None
                    else:
                        filepath = self._write_message_to_disk(message)
                        message = None
                        site_url = "%s://%s" % ("https" if site.get('use_https') else "http", site.get('site'))
                        these_reports = self._message_to_reports(filepath, site.get('site'), site_url, collection_name, site.get('enable_ip_ranges'))
                        t3 = time.time()
                        os.remove(filepath)
                        count = len(these_reports)
                        _logger.info("%s - %s - %s - %d (%d)(%.2f)(%.2f)" % (feed_helper.start_date, feed_helper.end_date, collection_name, count, message_len, (t2-t1), (t3-t2)))
                    break
                except Exception:
                    # Was a bare "except:", which also swallowed
                    # KeyboardInterrupt/SystemExit and made the retry loop
                    # impossible to interrupt.
                    _logger.error("%s" % traceback.format_exc())
                    time.sleep(5)
                    tries += 1

            if tries == 5:
                _logger.error("Giving up for site %s, collection %s" % (site.get('site'), collection))
                return

            if not self.export_mode:
                reports.extend(these_reports)

            if not feed_helper.advance():
                break
        ########## end while (for iterating across time)

        _logger.info("COMPLETED %s,%s,%s,%s,%s (%d)" % (site.get('site'),
                                              collection_name,
                                              sanitized_feed_name,
                                              available,
                                              collection_type,
                                              len(reports)))

        if not self.export_mode:
            # TODO -- clean this up
            if len(reports) > 0:
                # load existing data and convert new data
                reports = feed_helper.load_existing_feed_data() + reports

                # convert feed info and reports to json
                data = build_feed_data(sanitized_feed_name,
                                       "%s %s" % (site.get('site'), collection_name),
                                       site.get('site'),
                                       site.get('icon_link'),
                                       reports)

                # SAVE THE DATA: write out the feed file and save the details for when we last queried it
                if feed_helper.write_feed(data):
                    feed_helper.save_details()

                # Actually add CB feed if necessary
                feed_id = self.cb.feed_get_id_by_name(sanitized_feed_name)
                if not feed_id:
                    data = self.cb.feed_add_from_url("file://" + feed_helper.path,
                                              site.get('feeds_enable'),
                                              False,
                                              False)

                    # FEED ALERTING!!
                    feed_id = data.get('id')
                    url = "https://127.0.0.1/api/v1/feed/%d/action" % feed_id
                    alert_types = site.get('feeds_alerting', '').split(',')
                    headers = {'X-Auth-Token' : self.api_token, "Accept" : "application/json"}
                    for alert in alert_types:
                        # NOTE(review): verify=False is used for the local
                        # (127.0.0.1, self-signed) API endpoint -- confirm
                        # this never targets a remote host
                        if alert.lower() == "syslog":
                            action_data = {"action_data": """{"email_recipients":[1]}""", "action_type": 1, "group_id": feed_id, "watchlist_id": ""}
                            resp = requests.post(url, headers=headers, data=json.dumps(action_data), verify=False)
                            if resp.status_code != 200:
                                _logger.warning("Error for syslog action (%d): %s" % (feed_id, resp.content))
                        elif alert.lower() == "cb":
                            action_data = {"action_data": """{"email_recipients":[1]}""", "action_type": 3, "group_id": feed_id, "watchlist_id": ""}
                            resp = requests.post(url, headers=headers, data=json.dumps(action_data), verify=False)
                            if resp.status_code != 200:
                                _logger.warning("Error for cb action (%d): %s" % (feed_id, resp.content))
            else: # no reports
                feed_helper.save_details()
Example #3
0
    def _import_collection(self, client, site, collection):
        """Poll a DATA_FEED TAXII collection and publish it as a Cb feed.

        Polls the collection over advancing time windows, parses each content
        block into a STIXPackage, converts its observables into reports, then
        merges with previously saved feed data, writes the feed to disk and
        registers it with Cb Response if no feed of that name exists.

        :param client: TAXII client used for polling
        :param site: dict of per-site settings (site, output_path, start_date,
                     minutes_to_advance, poll_path, use_https, icon_link,
                     feeds_enable, ...)
        :param collection: TAXII collection (exposes .name, .available, .type)
        :return: False when the collection is unavailable or not a DATA_FEED,
                 otherwise None
        """

        collection_name = collection.name
        sanitized_feed_name = cleanup_string("%s%s" % (site.get('site'), collection_name))
        available = collection.available
        collection_type = collection.type
        logger.info("%s,%s,%s,%s,%s" % (site.get('site'),
                                         collection_name,
                                         sanitized_feed_name,
                                         available,
                                         collection_type))

        #
        # We only care about DATA_FEED type
        #
        if not available or collection_type != "DATA_FEED":
            return False

        #
        # Sanity check on start date; fall back to a fixed default when unset
        #
        start_date_str = site.get('start_date')
        if not start_date_str or len(start_date_str) == 0:
            start_date_str = "2016-12-01 00:00:00"


        #
        # Create a feed helper object that tracks the polling time window
        #
        feed_helper = FeedHelper(
            site.get('output_path'),
            sanitized_feed_name,
            site.get('minutes_to_advance'),
            start_date_str)

        logger.info("Feed start time %s" % feed_helper.start_date)
        logger.info("polling Collection: {}...".format(collection.name))

        #
        # Build up the URI for polling; None lets the client use its default
        #

        if not site.get('poll_path', ''):
            uri = None
        else:
            uri = ''
            if site.get('use_https'):
                uri += 'https://'
            else:
                uri += 'http://'

            uri += site.get('site')
            uri += site.get('poll_path')
            logger.info('Poll path: {}'.format(uri))

        reports = []
        while True:

            # NOTE(review): unlike the other variants of this method, poll and
            # parse errors here are unguarded and will propagate to the caller
            content_blocks = client.poll(uri=uri,
                                         collection_name=collection.name,
                                         begin_date=feed_helper.start_date,
                                         end_date=feed_helper.end_date,
                                         #content_bindings=BINDING_CHOICES)
                                         content_bindings=[CB_STIX_XML_12])

            #
            # Iterate through all content_blocks
            #
            num_blocks = 0
            for block in content_blocks:

                #
                # if in export mode then save off this content block
                #
                if self.export_dir:
                    self.export_xml(collection_name,
                                    feed_helper.start_date,
                                    feed_helper.end_date,
                                    num_blocks,
                                    block.content)

                #
                # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
                # etree.fromstring can parse this data.
                #
                root = etree.fromstring(block.content)
                content = root.find('.//{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content')
                if content is not None and len(content) == 0 and len(list(content)) == 0:
                    #
                    # Content has no children.  So lets make sure we parse the xml text for content and re-add
                    # it as valid XML so we can parse
                    #
                    new_stix_package = etree.fromstring(root.find(
                        "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content").text)
                    content.append(new_stix_package)

                #
                # Since we modified the xml, we need create a new xml message string to parse
                #
                message = etree.tostring(root)

                #
                # Write the content block to disk so we can parse with python stix
                #
                file_handle, file_path = self.write_to_temp_file(message)

                #
                # Parse STIX data
                #
                stix_package = STIXPackage.from_xml(file_path)

                #
                # Get the timestamp of the STIX Package so we can use this in our feed
                #
                timestamp = total_seconds(stix_package.timestamp)

                #
                # Now lets find some data.  Iterate through all observables and parse
                #
                if stix_package.observables:
                    for observable in stix_package.observables:
                        #
                        # Cybox observable returns a list
                        #
                        reports.extend(cybox_parse_observable(observable, timestamp))

                #
                # Delete our temporary file
                # NOTE(review): not reached if parsing above raises -- the
                # handle would leak; consider a try/finally here
                #
                file_handle.close()

                num_blocks += 1

                #
                # end for loop through content blocks
                #

            logger.info("content blocks read: {}".format(num_blocks))
            logger.info("current number of reports: {}".format(len(reports)))

            #
            # DEBUG CODE
            #
            #if len(reports) > 10:
            #    break

            #
            # Attempt to advance the start time and end time
            #
            if feed_helper.advance():
                continue
            else:
                break
            #
            # end While True
            #

        logger.info("Found {} new reports.".format(len(reports)))

        reports = feed_helper.load_existing_feed_data() + reports

        logger.info("Total number of reports: {}".format(len(reports)))

        data = build_feed_data(sanitized_feed_name,
                               "%s %s" % (site.get('site'), collection_name),
                               site.get('site'),
                               site.get('icon_link'),
                               reports)

        if feed_helper.write_feed(data):
            feed_helper.save_details()

        #
        # Create Cb Response Feed if necessary
        #
        feed_id = self.cb.feed_get_id_by_name(sanitized_feed_name)
        if not feed_id:
            data = self.cb.feed_add_from_url("file://" + feed_helper.path,
                                             site.get('feeds_enable'),
                                             False,
                                             False)
Example #4
0
    def _import_collection(self, client, site, collection, data_set=False):

        collection_name = collection.name
        sanitized_feed_name = cleanup_string(
            "%s%s" % (site.get('site'), collection_name))
        feed_summary = "%s %s" % (site.get('site'), collection_name)
        available = collection.available
        collection_type = collection.type
        default_score = site.get('default_score')
        logger.info("%s,%s,%s,%s,%s" %
                    (site.get('site'), collection_name, sanitized_feed_name,
                     available, collection_type))

        if not available:
            return False

        #
        # Sanity check on start date
        #
        start_date_str = site.get('start_date')
        if not start_date_str or len(start_date_str) == 0:
            start_date_str = "2019-01-01 00:00:00"

        #
        # Create a feed helper object
        #
        feed_helper = FeedHelper(site.get('output_path'), sanitized_feed_name,
                                 site.get('minutes_to_advance'),
                                 start_date_str)

        if not data_set:
            logger.info("Feed start time %s" % feed_helper.start_date)
        logger.info("polling Collection: {}...".format(collection.name))

        #
        # Build up the URI for polling
        #

        if not site.get('poll_path', ''):
            uri = None
        else:
            uri = ''
            if site.get('use_https'):
                uri += 'https://'
            else:
                uri += 'http://'

            uri += site.get('site')
            uri += site.get('poll_path')
            logger.info('Poll path: {}'.format(uri))

        reports = []
        while True:
            num_times_empty_content_blocks = 0
            try:
                try:
                    logger.info("Polling Collection: {0}".format(
                        collection.name))
                    content_blocks = client.poll(
                        uri=uri,
                        collection_name=collection.name,
                        begin_date=feed_helper.start_date,
                        end_date=feed_helper.end_date,
                        content_bindings=BINDING_CHOICES)

                except Exception as e:
                    logger.info(e.message)
                    content_blocks = []

                #
                # Iterate through all content_blocks
                #
                num_blocks = 0

                if not data_set:
                    logger.info("polling start_date: {}, end_date: {}".format(
                        feed_helper.start_date, feed_helper.end_date))
                for block in content_blocks:
                    logger.debug(block.content)

                    #
                    # if in export mode then save off this content block
                    #
                    if self.export_dir:
                        self.export_xml(collection_name,
                                        feed_helper.start_date,
                                        feed_helper.end_date, num_blocks,
                                        block.content)

                    #
                    # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
                    # etree.fromstring can parse this data.
                    #
                    try:
                        root = etree.fromstring(block.content)
                        content = root.find(
                            './/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content'
                        )
                        if content is not None and len(content) == 0 and len(
                                list(content)) == 0:
                            #
                            # Content has no children.  So lets make sure we parse the xml text for content and re-add
                            # it as valid XML so we can parse
                            #
                            new_stix_package = etree.fromstring(
                                root.find(
                                    "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content"
                                ).text)
                            content.append(new_stix_package)

                        #
                        # Since we modified the xml, we need create a new xml message string to parse
                        #
                        message = etree.tostring(root)

                        #
                        # Write the content block to disk so we can parse with python stix
                        #
                        file_handle, file_path = self.write_to_temp_file(
                            message)

                        #
                        # Parse STIX data
                        #
                        stix_package = STIXPackage.from_xml(file_path)

                        #
                        # if it is a DATA_SET make feed_summary from the stix_header description
                        # NOTE: this is for RecordedFuture, also note that we only do this for data_sets.
                        #       to date I have only seen RecordedFuture use data_sets
                        #
                        if data_set and stix_package.stix_header and stix_package.stix_header.descriptions:
                            for desc in stix_package.stix_header.descriptions:
                                feed_summary = "{}: {}".format(
                                    desc.value, collection_name)
                                break

                        #
                        # Get the timestamp of the STIX Package so we can use this in our feed
                        #
                        timestamp = total_seconds(stix_package.timestamp)

                        if not stix_package.indicators and not stix_package.observables:
                            num_times_empty_content_blocks += 1
                            if num_times_empty_content_blocks > 10:
                                break

                        if stix_package.indicators:
                            for indicator in stix_package.indicators:

                                if not indicator or not indicator.observable:
                                    continue

                                if indicator.confidence:

                                    if str(indicator.confidence.value).isdigit(
                                    ):
                                        #
                                        # Get the confidence score and use it for our score
                                        #
                                        score = int(
                                            indicator.confidence.to_dict().get(
                                                "value", default_score))
                                    else:
                                        if str(indicator.confidence.value
                                               ).lower() == "high":
                                            score = 75
                                        elif str(indicator.confidence.value
                                                 ).lower() == "medium":
                                            score = 50
                                        elif str(indicator.confidence.value
                                                 ).lower() == "low":
                                            score = 25
                                        else:
                                            # Unrecognized confidence value:
                                            # fall back to the site default.
                                            score = default_score
                                else:
                                    # Indicator carries no confidence block.
                                    score = default_score

                                # Convert the indicator timestamp to integer
                                # epoch seconds (UTC); indicators without a
                                # timestamp are pinned to 0.
                                if not indicator.timestamp:
                                    timestamp = 0
                                else:
                                    timestamp = int(
                                        (indicator.timestamp -
                                         datetime.datetime(1970, 1, 1).replace(
                                             tzinfo=dateutil.tz.tzutc())
                                         ).total_seconds())

                                reports.extend(
                                    cybox_parse_observable(
                                        indicator.observable, indicator,
                                        timestamp, score))

                        #
                        # Now lets find some data.  Iterate through all observables and parse
                        #
                        if stix_package.observables:
                            for observable in stix_package.observables:
                                if not observable:
                                    continue
                                #
                                # Cybox observable returns a list
                                #
                                # NOTE(review): `timestamp` here is whatever
                                # the indicator loop above last assigned; if
                                # this package had no indicators it is
                                # undefined (NameError) or stale from a prior
                                # content block -- confirm the intended value
                                # for bare observables (0 seems likely).
                                reports.extend(
                                    cybox_parse_observable(
                                        observable, None, timestamp,
                                        default_score))

                        #
                        # Delete our temporary file
                        #
                        # NOTE(review): skipped when an exception fires below
                        # (the `continue` in the except bypasses this close),
                        # leaking the handle -- consider try/finally or a
                        # context manager.
                        file_handle.close()

                        num_blocks += 1

                        #
                        # end for loop through content blocks
                        #

                    except Exception as e:
                        # Best-effort: log and move on to the next content
                        # block rather than aborting the whole collection.
                        # logger.info(traceback.format_exc())
                        # NOTE(review): Exception.message is Python 2 only;
                        # under Python 3 this raises AttributeError -- use
                        # str(e) instead.
                        logger.info(e.message)
                        continue

                logger.info("content blocks read: {}".format(num_blocks))
                logger.info("current number of reports: {}".format(
                    len(reports)))

                # Stop polling once this run has gathered more reports than
                # the site's configured cap (the list is truncated further
                # below before the feed is written).
                if len(reports) > site.get('reports_limit'):
                    logger.info(
                        "We have reached the reports limit of {0}".format(
                            site.get('reports_limit')))
                    break
                #
                # DEBUG CODE
                #
                # if len(reports) > 10:
                #    break

                #
                # Attempt to advance the start time and end time
                #

            except Exception as e:
                # Log the full traceback but keep the polling loop alive.
                logger.info(traceback.format_exc())

            #
            # If it is just a data_set, the data is unordered, so we can just break out of the while loop
            #
            if data_set:
                break

            # Move the polling window forward; stop once the feed helper
            # reports there is no further window to advance into.
            if feed_helper.advance():
                continue
            else:
                break
            #
            # end While True
            #

        logger.info("Found {} new reports.".format(len(reports)))

        if not data_set:
            #
            # We only want to concatenate if we are NOT a data set, otherwise we want to refresh all the reports
            #
            logger.info("Adding existing reports...")
            reports = feed_helper.load_existing_feed_data() + reports

        logger.info("Total number of reports: {}".format(len(reports)))

        # Enforce the per-site cap again after merging with previously saved
        # feed data.
        if site.get('reports_limit') < len(reports):
            logger.info("Truncating reports to length {0}".format(
                site.get('reports_limit')))
            reports = reports[:site.get('reports_limit')]

        data = build_feed_data(sanitized_feed_name,
                               "%s %s" % (site.get('site'), collection_name),
                               feed_summary, site.get('site'),
                               site.get('icon_link'), reports)

        # Persist the feed file; only save details (the new high-water mark)
        # when the write succeeded, so a failed write is retried next run.
        if feed_helper.write_feed(data):
            feed_helper.save_details()

        #
        # Create Cb Response Feed if necessary
        #

        feed_id = None

        # Look up an existing feed with this name on the Cb server so we do
        # not create duplicates on subsequent runs.
        try:
            feeds = get_object_by_name_or_id(self.cb,
                                             Feed,
                                             name=sanitized_feed_name)

            if not feeds:
                logger.info(
                    "Feed {} was not found, so we are going to create it".
                    format(sanitized_feed_name))

            elif len(feeds) > 1:
                # Ambiguous: multiple feeds share this name; pick the first.
                logger.warning(
                    "Multiple feeds found, selecting Feed id {}".format(
                        feeds[0].id))
                feed_id = feeds[0].id

            elif feeds:
                feed_id = feeds[0].id
                logger.info("Feed {} was found as Feed ID {}".format(
                    sanitized_feed_name, feed_id))

        except Exception as e:
            # NOTE(review): Exception.message is Python 2 only; under
            # Python 3 this raises AttributeError -- use str(e) instead.
            logger.info(e.message)

        if not feed_id:
            # No existing feed: register the on-disk feed file with the
            # Cb Response server via a file:// URL.
            logger.info("Creating {} feed for the first time".format(
                sanitized_feed_name))

            f = self.cb.create(Feed)
            f.feed_url = "file://" + feed_helper.path
            f.enabled = site.get('feeds_enable')
            f.use_proxy = False
            f.validate_server_cert = False
            try:
                f.save()
            except ServerError as se:
                # A 500 here usually means the server could not retrieve the
                # feed URL it was just given.
                if se.error_code == 500:
                    logger.info("Could not add feed:")
                    logger.info(
                        " Received error code 500 from server. This is usually because the server cannot retrieve the feed."
                    )
                    logger.info(
                        " Check to ensure the Cb server has network connectivity and the credentials are correct."
                    )
                else:
                    logger.info("Could not add feed: {0:s}".format(str(se)))
            except Exception as e:
                logger.info("Could not add feed: {0:s}".format(str(e)))
            else:
                # Save succeeded: report the newly assigned feed id.
                logger.info("Feed data: {0:s}".format(str(f)))
                logger.info("Added feed. New feed ID is {0:d}".format(f.id))
                feed_id = f.id

        return feed_id