Example #1
    def get_hash_of_all_results(self):
        """Return a single hash computed over all of the accumulated per-result content hashes."""
        return hash_helper.hash_data(self.result_hashes)
Example #2
    def get_hash_of_all_matches(self):
        """Return a single hash computed over all of the accumulated match hashes."""
        return hash_helper.hash_data(self.match_hashes)
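
Both helpers above belong to the result-tracking object that Example #3 builds up. Below is a minimal sketch of what that WebInputResult class might look like, assuming only the attributes actually referenced in these examples (result_hashes, match_hashes, the latest_* hashes, and the output counter); the real class may carry additional state.

class WebInputResult(object):
    """Illustrative sketch: tracks hashes and counts for one run of the web input."""

    def __init__(self):
        # Per-result content hashes (one entry per result that provided content_sha224)
        self.result_hashes = []

        # One matches hash per call to output_results()
        self.match_hashes = []

        # Hashes from the most recent scrape
        self.latest_content_hash = None
        self.latest_matches_hash = None

        # Number of results actually sent to Splunk
        self.results_outputted = 0

    # ...plus the get_hash_of_all_results() and get_hash_of_all_matches()
    # helpers shown in Examples #1 and #2.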
Example #3
    def output_results(self,
                       results,
                       index,
                       source,
                       sourcetype,
                       host,
                       checkpoint_data,
                       output_results_policy,
                       result_info=None):
        """
        Output the results to Splunk unless the output policy indicates they should be skipped
        (for example, because the matches or content have not changed since the last run).

        Returns a WebInputResult instance; its results_outputted attribute indicates how many results were outputted.

        Arguments:
        results -- The results from scrape_page (a list of dictionaries containing the matches and related data)
        index -- The index to send the data to
        source -- The name of the source
        sourcetype -- The name of the sourcetype
        host -- The name of the host
        checkpoint_data -- The checkpoint data dictionary provided to the modular input
        output_results_policy -- A string representing how output should be exported
        result_info -- An instance of WebInputResult for tracking information such as result hashes
        """

        # Create an instance of the web-result output
        if result_info is None:
            result_info = WebInputResult()

        # Process the result (if we got one)
        if results is not None:

            # Compute the hash of the matches
            with Timer() as timer:

                # Hash the results
                result_info.latest_content_hash = hash_helper.hash_data(
                    results, WebScraper.GENERATED_FIELDS)

                # Accumulate the matches hashes so that we can generate a hash of the matches
                matches_content = []

                for result in results:
                    # Handle MV based match content
                    if 'match' in result:
                        matches_content.append(result['match'])

                    # Handle non-MV based match content by looking for fields that are not generated as meta fields
                    else:
                        for key, value in result.items():
                            if key not in WebScraper.GENERATED_FIELDS:
                                matches_content.append(value)

                result_info.latest_matches_hash = hash_helper.hash_data(
                    matches_content)

            # Add to the list of the matches
            result_info.match_hashes.append(result_info.latest_matches_hash)

            # Calculate the hash of all of the matches
            hash_of_all_matches = result_info.get_hash_of_all_matches()
            logger.debug(
                "Hash of results calculated, time=%sms, hash=%s, prior_hash=%s",
                round(timer.msecs, 3), hash_of_all_matches,
                checkpoint_data.get('matches_hash', ''))

            # Don't output the results if we are set to not output results unless the matches change
            # Note: we will compare the content later
            if output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_MATCHES_CHANGE and checkpoint_data.get(
                    'matches_hash', '') == hash_of_all_matches:
                logger.info(
                    "Matches data matched the prior result, it will be skipped since output_results=%s, hash=%s",
                    output_results_policy, hash_of_all_matches)

            else:
                # Build up a list of the hashes so that we can determine if the content changed
                for r in results:

                    # Add the hash
                    if r.get('content_sha224', None) is not None:
                        result_info.result_hashes.append(
                            r.get('content_sha224', ''))

                # Check to see if the content changed
                # Don't output the results if we are set to not output results unless the content changes
                hash_of_all_results = result_info.get_hash_of_all_results()
                if output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_CONTENTS_CHANGE and checkpoint_data.get(
                        'content_hash', '') == hash_of_all_results:
                    logger.info(
                        "Content data matched the prior result, it will be skipped since output_results=%s, hash=%s",
                        output_results_policy, hash_of_all_results)

                else:
                    # Process each event
                    for r in results:
                        # Send the event
                        if self.OUTPUT_USING_STASH:
                            # Write the event as a stash new file
                            writer = StashNewWriter(
                                index=index,
                                source_name=source,
                                file_extension=".stash_web_input",
                                sourcetype=sourcetype,
                                host=host)
                            logger.debug("Wrote stash file=%s",
                                         writer.write_event(r))

                        else:
                            # Write the event using the built-in modular input method
                            self.output_event(
                                r,
                                source,
                                index=index,
                                source=source,
                                sourcetype=sourcetype,
                                host=host,
                                unbroken=True,
                                close=True,
                                encapsulate_value_in_double_quotes=True)

                        # Keep a count of the results sent
                        result_info.results_outputted += 1

        return result_info
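
A hedged sketch of how output_results might be called from the surrounding modular input and how the returned WebInputResult could be used to update the checkpoint. The scrape_page, get_checkpoint_data, and save_checkpoint_data calls and the stanza/url/selector variables are assumptions about the surrounding class, not shown in the examples above; only the checkpoint keys 'matches_hash' and 'content_hash' and the OUTPUT_RESULTS_WHEN_MATCHES_CHANGE policy come from the code itself.

# Illustrative only: assumes the surrounding modular input exposes scrape_page(),
# get_checkpoint_data(), and save_checkpoint_data(); names and signatures may differ.
results = self.scrape_page(url, selector)
checkpoint_data = self.get_checkpoint_data(stanza) or {}

result_info = self.output_results(results,
                                  index="main",
                                  source=stanza,
                                  sourcetype="web_input",
                                  host="localhost",
                                  checkpoint_data=checkpoint_data,
                                  output_results_policy=WebInput.OUTPUT_RESULTS_WHEN_MATCHES_CHANGE)

# Persist the latest hashes so the next run can detect unchanged matches/content
self.save_checkpoint_data(stanza, {
    'matches_hash': result_info.get_hash_of_all_matches(),
    'content_hash': result_info.get_hash_of_all_results()
})

logger.info("Run complete, results_outputted=%s", result_info.results_outputted)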