Example #1
0
    def _import_collection(self, client, site, collection, data_set=False):
        """Poll a single TAXII collection and publish it as a Cb Response feed.

        Polls the collection in time windows (advanced via ``FeedHelper``),
        parses each returned content block as a STIX package, converts its
        indicators/observables into report dicts, appends previously saved
        feed data, writes the feed to disk, and registers the feed with the
        Cb Response server if it does not already exist.

        :param client: TAXII client used to poll content blocks.
        :param site: dict of site configuration; reads 'site',
            'default_score', 'start_date', 'output_path',
            'minutes_to_advance', 'use_https', 'poll_path', 'icon_link' and
            'feeds_enable'.
        :param collection: TAXII collection object; its ``name``,
            ``available`` and ``type`` attributes are read.
        :param data_set: True for unordered data-set collections (observed
            with RecordedFuture) -- polled once instead of windowed.
        :return: False if the collection is not available; otherwise None.
        """
        collection_name = collection.name
        sanitized_feed_name = cleanup_string(
            "%s%s" % (site.get('site'), collection_name))
        feed_summary = "%s %s" % (site.get('site'), collection_name)
        available = collection.available
        collection_type = collection.type
        default_score = site.get('default_score')
        logger.info("%s,%s,%s,%s,%s" %
                    (site.get('site'), collection_name, sanitized_feed_name,
                     available, collection_type))

        if not available:
            return False

        #
        # Sanity check on start date: fall back to a fixed default when the
        # site config does not supply one.
        #
        start_date_str = site.get('start_date')
        if not start_date_str or len(start_date_str) == 0:
            start_date_str = "2017-01-01 00:00:00"

        #
        # Create a feed helper object (tracks the polling time window and
        # persists feed data/state under the configured output path).
        #
        feed_helper = FeedHelper(site.get('output_path'), sanitized_feed_name,
                                 site.get('minutes_to_advance'),
                                 start_date_str)

        if not data_set:
            logger.info("Feed start time %s" % feed_helper.start_date)
        logger.info("polling Collection: {}...".format(collection.name))

        #
        # Build up the URI for polling; None lets the client use its default.
        #

        if not site.get('poll_path', ''):
            uri = None
        else:
            uri = ''
            if site.get('use_https'):
                uri += 'https://'
            else:
                uri += 'http://'

            uri += site.get('site')
            uri += site.get('poll_path')
            logger.info('Poll path: {}'.format(uri))

        reports = []
        while True:

            try:
                try:
                    content_blocks = client.poll(
                        uri=uri,
                        collection_name=collection.name,
                        begin_date=feed_helper.start_date,
                        end_date=feed_helper.end_date,
                        content_bindings=BINDING_CHOICES)

                except Exception as e:
                    # str(e): Exception.message does not exist on Python 3.
                    logger.info(str(e))
                    content_blocks = []

                #
                # Iterate through all content_blocks
                #
                num_blocks = 0

                if not data_set:
                    logger.info("polling start_date: {}, end_date: {}".format(
                        feed_helper.start_date, feed_helper.end_date))
                for block in content_blocks:

                    #
                    # if in export mode then save off this content block
                    #
                    if self.export_dir:
                        self.export_xml(collection_name,
                                        feed_helper.start_date,
                                        feed_helper.end_date, num_blocks,
                                        block.content)

                    #
                    # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
                    # etree.fromstring can parse this data.
                    #
                    try:
                        root = etree.fromstring(block.content)
                        content = root.find(
                            './/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content'
                        )
                        if content is not None and len(content) == 0 and len(
                                list(content)) == 0:
                            #
                            # Content has no children.  So lets make sure we parse the xml text for content and re-add
                            # it as valid XML so we can parse
                            #
                            new_stix_package = etree.fromstring(
                                root.find(
                                    "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content"
                                ).text)
                            content.append(new_stix_package)

                        #
                        # Since we modified the xml, we need create a new xml message string to parse
                        #
                        message = etree.tostring(root)

                        #
                        # Write the content block to disk so we can parse with python stix
                        #
                        file_handle, file_path = self.write_to_temp_file(
                            message)

                        #
                        # Parse STIX data
                        #
                        stix_package = STIXPackage.from_xml(file_path)

                        #
                        # if it is a DATA_SET make feed_summary from the stix_header description
                        # NOTE: this is for RecordedFuture, also note that we only do this for data_sets.
                        #       to date I have only seen RecordedFuture use data_sets
                        #
                        if data_set and stix_package.stix_header and stix_package.stix_header.descriptions:
                            for desc in stix_package.stix_header.descriptions:
                                feed_summary = desc.value
                                break

                        #
                        # Get the timestamp of the STIX Package so we can use this in our feed
                        #
                        timestamp = total_seconds(stix_package.timestamp)

                        if stix_package.indicators:
                            for indicator in stix_package.indicators:
                                if not indicator or not indicator.observable:
                                    continue

                                # Per-indicator timestamp overrides the
                                # package timestamp (epoch seconds; 0 when
                                # the indicator has none).
                                if not indicator.timestamp:
                                    timestamp = 0
                                else:
                                    timestamp = int(
                                        (indicator.timestamp -
                                         datetime.datetime(1970, 1, 1).replace(
                                             tzinfo=dateutil.tz.tzutc())
                                         ).total_seconds())

                                reports.extend(
                                    cybox_parse_observable(
                                        indicator.observable, indicator,
                                        timestamp, default_score))

                        #
                        # Now lets find some data.  Iterate through all observables and parse
                        # NOTE(review): `timestamp` here is whatever the
                        # indicator loop last set it to (not necessarily the
                        # package timestamp) -- confirm this is intentional.
                        #
                        if stix_package.observables:
                            for observable in stix_package.observables:
                                if not observable:
                                    continue
                                #
                                # Cybox observable returns a list
                                #
                                reports.extend(
                                    cybox_parse_observable(
                                        observable, None, timestamp,
                                        default_score))

                        #
                        # Delete our temporary file
                        # NOTE(review): if parsing raises before this point
                        # the temp file handle leaks -- consider try/finally.
                        #
                        file_handle.close()

                        num_blocks += 1

                        #
                        # end for loop through content blocks
                        #

                    except Exception as e:
                        #logger.info(traceback.format_exc())
                        # str(e): Exception.message does not exist on Python 3.
                        logger.info(str(e))
                        continue

                logger.info("content blocks read: {}".format(num_blocks))
                logger.info("current number of reports: {}".format(
                    len(reports)))

                #
                # DEBUG CODE
                #
                #if len(reports) > 10:
                #    break

                #
                # Attempt to advance the start time and end time
                #

            except Exception as e:
                logger.info(traceback.format_exc())

            #
            # If it is just a data_set, the data is unordered, so we can just break out of the while loop
            #
            if data_set:
                break

            if feed_helper.advance():
                continue
            else:
                break
            #
            # end While True
            #

        logger.info("Found {} new reports.".format(len(reports)))

        reports = feed_helper.load_existing_feed_data() + reports

        logger.info("Total number of reports: {}".format(len(reports)))

        data = build_feed_data(sanitized_feed_name,
                               "%s %s" % (site.get('site'), collection_name),
                               feed_summary, site.get('site'),
                               site.get('icon_link'), reports)

        if feed_helper.write_feed(data):
            feed_helper.save_details()

        #
        # Create Cb Response Feed if necessary
        #

        feed_id = self.cb.feed_get_id_by_name(sanitized_feed_name)
        if not feed_id:
            self.cb.feed_add_from_url("file://" + feed_helper.path,
                                      site.get('feeds_enable'), False, False)
Example #2
0
    def _import_collection(self, client, site, collection, data_set=False):
        """Poll a single TAXII collection and publish it as a Cb Response feed.

        Polls the collection in time windows (advanced via ``FeedHelper``),
        parses each returned content block as a STIX package, converts its
        indicators/observables into report dicts (scoring from indicator
        confidence when present), enforces the site's report limit, writes
        the feed to disk, and creates the feed on the Cb Response server if
        it does not already exist.

        :param client: TAXII client used to poll content blocks.
        :param site: dict of site configuration; reads 'site',
            'default_score', 'start_date', 'output_path',
            'minutes_to_advance', 'use_https', 'poll_path', 'reports_limit',
            'icon_link' and 'feeds_enable'.
        :param collection: TAXII collection object; its ``name``,
            ``available`` and ``type`` attributes are read.
        :param data_set: True for unordered data-set collections (observed
            with RecordedFuture) -- polled once instead of windowed, and the
            existing feed data is replaced rather than appended to.
        :return: False if the collection is not available, otherwise the
            feed id (existing or newly created; None on failure to create).
        """
        collection_name = collection.name
        sanitized_feed_name = cleanup_string(
            "%s%s" % (site.get('site'), collection_name))
        feed_summary = "%s %s" % (site.get('site'), collection_name)
        available = collection.available
        collection_type = collection.type
        default_score = site.get('default_score')
        logger.info("%s,%s,%s,%s,%s" %
                    (site.get('site'), collection_name, sanitized_feed_name,
                     available, collection_type))

        if not available:
            return False

        #
        # Sanity check on start date: fall back to a fixed default when the
        # site config does not supply one.
        #
        start_date_str = site.get('start_date')
        if not start_date_str or len(start_date_str) == 0:
            start_date_str = "2019-01-01 00:00:00"

        #
        # Create a feed helper object (tracks the polling time window and
        # persists feed data/state under the configured output path).
        #
        feed_helper = FeedHelper(site.get('output_path'), sanitized_feed_name,
                                 site.get('minutes_to_advance'),
                                 start_date_str)

        if not data_set:
            logger.info("Feed start time %s" % feed_helper.start_date)
        logger.info("polling Collection: {}...".format(collection.name))

        #
        # Build up the URI for polling; None lets the client use its default.
        #

        if not site.get('poll_path', ''):
            uri = None
        else:
            uri = ''
            if site.get('use_https'):
                uri += 'https://'
            else:
                uri += 'http://'

            uri += site.get('site')
            uri += site.get('poll_path')
            logger.info('Poll path: {}'.format(uri))

        reports = []
        while True:
            num_times_empty_content_blocks = 0
            try:
                try:
                    logger.info("Polling Collection: {0}".format(
                        collection.name))
                    content_blocks = client.poll(
                        uri=uri,
                        collection_name=collection.name,
                        begin_date=feed_helper.start_date,
                        end_date=feed_helper.end_date,
                        content_bindings=BINDING_CHOICES)

                except Exception as e:
                    # str(e): Exception.message does not exist on Python 3.
                    logger.info(str(e))
                    content_blocks = []

                #
                # Iterate through all content_blocks
                #
                num_blocks = 0

                if not data_set:
                    logger.info("polling start_date: {}, end_date: {}".format(
                        feed_helper.start_date, feed_helper.end_date))
                for block in content_blocks:
                    logger.debug(block.content)

                    #
                    # if in export mode then save off this content block
                    #
                    if self.export_dir:
                        self.export_xml(collection_name,
                                        feed_helper.start_date,
                                        feed_helper.end_date, num_blocks,
                                        block.content)

                    #
                    # This code accounts for a case found with ThreatCentral.io where the content is url encoded.
                    # etree.fromstring can parse this data.
                    #
                    try:
                        root = etree.fromstring(block.content)
                        content = root.find(
                            './/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content'
                        )
                        if content is not None and len(content) == 0 and len(
                                list(content)) == 0:
                            #
                            # Content has no children.  So lets make sure we parse the xml text for content and re-add
                            # it as valid XML so we can parse
                            #
                            new_stix_package = etree.fromstring(
                                root.find(
                                    "{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content_Block/{http://taxii.mitre.org/messages/taxii_xml_binding-1.1}Content"
                                ).text)
                            content.append(new_stix_package)

                        #
                        # Since we modified the xml, we need create a new xml message string to parse
                        #
                        message = etree.tostring(root)

                        #
                        # Write the content block to disk so we can parse with python stix
                        #
                        file_handle, file_path = self.write_to_temp_file(
                            message)

                        #
                        # Parse STIX data
                        #
                        stix_package = STIXPackage.from_xml(file_path)

                        #
                        # if it is a DATA_SET make feed_summary from the stix_header description
                        # NOTE: this is for RecordedFuture, also note that we only do this for data_sets.
                        #       to date I have only seen RecordedFuture use data_sets
                        #
                        if data_set and stix_package.stix_header and stix_package.stix_header.descriptions:
                            for desc in stix_package.stix_header.descriptions:
                                feed_summary = "{}: {}".format(
                                    desc.value, collection_name)
                                break

                        #
                        # Get the timestamp of the STIX Package so we can use this in our feed
                        #
                        timestamp = total_seconds(stix_package.timestamp)

                        # Bail out of this window after too many consecutive
                        # packages with no indicators and no observables.
                        if not stix_package.indicators and not stix_package.observables:
                            num_times_empty_content_blocks += 1
                            if num_times_empty_content_blocks > 10:
                                break

                        if stix_package.indicators:
                            for indicator in stix_package.indicators:

                                if not indicator or not indicator.observable:
                                    continue

                                if indicator.confidence:

                                    # Numeric confidence is used directly as
                                    # the score; High/Medium/Low map to
                                    # 75/50/25; anything else falls back to
                                    # the site default.
                                    if str(indicator.confidence.value).isdigit(
                                    ):
                                        #
                                        # Get the confidence score and use it for our score
                                        #
                                        score = int(
                                            indicator.confidence.to_dict().get(
                                                "value", default_score))
                                    else:
                                        if str(indicator.confidence.value
                                               ).lower() == "high":
                                            score = 75
                                        elif str(indicator.confidence.value
                                                 ).lower() == "medium":
                                            score = 50
                                        elif str(indicator.confidence.value
                                                 ).lower() == "low":
                                            score = 25
                                        else:
                                            score = default_score
                                else:
                                    score = default_score

                                # Per-indicator timestamp overrides the
                                # package timestamp (epoch seconds; 0 when
                                # the indicator has none).
                                if not indicator.timestamp:
                                    timestamp = 0
                                else:
                                    timestamp = int(
                                        (indicator.timestamp -
                                         datetime.datetime(1970, 1, 1).replace(
                                             tzinfo=dateutil.tz.tzutc())
                                         ).total_seconds())

                                reports.extend(
                                    cybox_parse_observable(
                                        indicator.observable, indicator,
                                        timestamp, score))

                        #
                        # Now lets find some data.  Iterate through all observables and parse
                        # NOTE(review): `timestamp` here is whatever the
                        # indicator loop last set it to (not necessarily the
                        # package timestamp) -- confirm this is intentional.
                        #
                        if stix_package.observables:
                            for observable in stix_package.observables:
                                if not observable:
                                    continue
                                #
                                # Cybox observable returns a list
                                #
                                reports.extend(
                                    cybox_parse_observable(
                                        observable, None, timestamp,
                                        default_score))

                        #
                        # Delete our temporary file
                        # NOTE(review): if parsing raises before this point
                        # the temp file handle leaks -- consider try/finally.
                        #
                        file_handle.close()

                        num_blocks += 1

                        #
                        # end for loop through content blocks
                        #

                    except Exception as e:
                        # logger.info(traceback.format_exc())
                        # str(e): Exception.message does not exist on Python 3.
                        logger.info(str(e))
                        continue

                logger.info("content blocks read: {}".format(num_blocks))
                logger.info("current number of reports: {}".format(
                    len(reports)))

                # NOTE(review): assumes 'reports_limit' is always present in
                # the site config -- the comparison raises TypeError on
                # Python 3 if it is None. Confirm against config loader.
                if len(reports) > site.get('reports_limit'):
                    logger.info(
                        "We have reached the reports limit of {0}".format(
                            site.get('reports_limit')))
                    break
                #
                # DEBUG CODE
                #
                # if len(reports) > 10:
                #    break

                #
                # Attempt to advance the start time and end time
                #

            except Exception as e:
                logger.info(traceback.format_exc())

            #
            # If it is just a data_set, the data is unordered, so we can just break out of the while loop
            #
            if data_set:
                break

            if feed_helper.advance():
                continue
            else:
                break
            #
            # end While True
            #

        logger.info("Found {} new reports.".format(len(reports)))

        if not data_set:
            #
            # We only want to concatenate if we are NOT a data set, otherwise we want to refresh all the reports
            #
            logger.info("Adding existing reports...")
            reports = feed_helper.load_existing_feed_data() + reports

        logger.info("Total number of reports: {}".format(len(reports)))

        if site.get('reports_limit') < len(reports):
            logger.info("Truncating reports to length {0}".format(
                site.get('reports_limit')))
            reports = reports[:site.get('reports_limit')]

        data = build_feed_data(sanitized_feed_name,
                               "%s %s" % (site.get('site'), collection_name),
                               feed_summary, site.get('site'),
                               site.get('icon_link'), reports)

        if feed_helper.write_feed(data):
            feed_helper.save_details()

        #
        # Create Cb Response Feed if necessary
        #

        feed_id = None

        try:
            feeds = get_object_by_name_or_id(self.cb,
                                             Feed,
                                             name=sanitized_feed_name)

            if not feeds:
                logger.info(
                    "Feed {} was not found, so we are going to create it".
                    format(sanitized_feed_name))

            elif len(feeds) > 1:
                logger.warning(
                    "Multiple feeds found, selecting Feed id {}".format(
                        feeds[0].id))
                feed_id = feeds[0].id

            elif feeds:
                feed_id = feeds[0].id
                logger.info("Feed {} was found as Feed ID {}".format(
                    sanitized_feed_name, feed_id))

        except Exception as e:
            # str(e): Exception.message does not exist on Python 3.
            logger.info(str(e))

        if not feed_id:
            logger.info("Creating {} feed for the first time".format(
                sanitized_feed_name))

            f = self.cb.create(Feed)
            f.feed_url = "file://" + feed_helper.path
            f.enabled = site.get('feeds_enable')
            f.use_proxy = False
            f.validate_server_cert = False
            try:
                f.save()
            except ServerError as se:
                if se.error_code == 500:
                    logger.info("Could not add feed:")
                    logger.info(
                        " Received error code 500 from server. This is usually because the server cannot retrieve the feed."
                    )
                    logger.info(
                        " Check to ensure the Cb server has network connectivity and the credentials are correct."
                    )
                else:
                    logger.info("Could not add feed: {0:s}".format(str(se)))
            except Exception as e:
                logger.info("Could not add feed: {0:s}".format(str(e)))
            else:
                logger.info("Feed data: {0:s}".format(str(f)))
                logger.info("Added feed. New feed ID is {0:d}".format(f.id))
                feed_id = f.id

        return feed_id