def get_hash_of_all_results(self):
    """Return a hash computed over all of the accumulated result content hashes."""
    return hash_helper.hash_data(self.result_hashes)

def get_hash_of_all_matches(self):
    """Return a hash computed over all of the accumulated match hashes."""
    return hash_helper.hash_data(self.match_hashes)
def output_results(self, results, index, source, sourcetype, host, checkpoint_data,
                   output_results_policy, result_info=None):
    """
    Output the results to Splunk unless the results don't match the export policy.

    Returns a WebInputResult instance tracking how many results were outputted.

    Arguments:
    results -- The results from scrape_page (a list of dictionaries containing the matches and related data)
    index -- The index to send the data to
    source -- The name of the source
    sourcetype -- The name of the sourcetype
    host -- The name of the host
    checkpoint_data -- The checkpoint data dictionary provided to the modular input
    output_results_policy -- A string representing how output should be exported
    result_info -- An instance of WebInputResult for tracking information such as result hashes
    """

    # Create an instance of the web-result output
    if result_info is None:
        result_info = WebInputResult()

    # Process the result (if we got one)
    if results is not None:

        # Compute the hash of the matches
        with Timer() as timer:

            # Hash the results
            result_info.latest_content_hash = hash_helper.hash_data(results,
                                                                    WebScraper.GENERATED_FIELDS)

            # Accumulate the matches hashes so that we can generate a hash of the matches
            matches_content = []

            for result in results:

                # Handle MV based match content
                if 'match' in result:
                    matches_content.append(result['match'])

                # Handle non-MV based match content by looking for fields that are not
                # generated as meta fields
                else:
                    for key, value in result.items():
                        if key not in WebScraper.GENERATED_FIELDS:
                            matches_content.append(value)

            result_info.latest_matches_hash = hash_helper.hash_data(matches_content)

            # Add to the list of the matches
            result_info.match_hashes.append(result_info.latest_matches_hash)

        # Calculate the hash of all of the matches
        hash_of_all_matches = result_info.get_hash_of_all_matches()

        logger.debug("Hash of results calculated, time=%sms, hash=%s, prior_hash=%s",
                     round(timer.msecs, 3), hash_of_all_matches,
                     checkpoint_data.get('matches_hash', ''))

        # Don't output the results if we are set to not output results unless the matches change
        # Note: we will compare the content later
        if output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_MATCHES_CHANGE \
                and checkpoint_data.get('matches_hash', '') == hash_of_all_matches:
            logger.info("Matches data matched the prior result, it will be skipped since "
                        "output_results=%s, hash=%s", output_results_policy, hash_of_all_matches)

        else:
            # Build up a list of the hashes so that we can determine if the content changed
            for r in results:

                # Add the hash
                if r.get('content_sha224', None) is not None:
                    result_info.result_hashes.append(r.get('content_sha224', ''))

            # Check to see if the content changed
            # Don't output the results if we are set to not output results unless the content changes
            hash_of_all_results = result_info.get_hash_of_all_results()

            if output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_CONTENTS_CHANGE \
                    and checkpoint_data.get('content_hash', '') == hash_of_all_results:
                logger.info("Content data matched the prior result, it will be skipped since "
                            "output_results=%s, hash=%s", output_results_policy, hash_of_all_results)

            else:
                # Process each event
                for r in results:

                    # Send the event
                    if self.OUTPUT_USING_STASH:
                        # Write the event as a stash new file
                        writer = StashNewWriter(index=index, source_name=source,
                                                file_extension=".stash_web_input",
                                                sourcetype=sourcetype, host=host)
                        logger.debug("Wrote stash file=%s", writer.write_event(r))

                    else:
                        # Write the event using the built-in modular input method
                        self.output_event(r, source, index=index, source=source,
                                          sourcetype=sourcetype, host=host, unbroken=True,
                                          close=True,
                                          encapsulate_value_in_double_quotes=True)

                    # Keep a count of the results sent
                    result_info.results_outputted += 1

    return result_info
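# Illustrative caller sketch (not part of the original module): how a modular input's run
# loop might invoke output_results() and then persist the hashes that the policy checks
# above compare against on the next run. The checkpoint keys ('matches_hash',
# 'content_hash') come from the comparisons above; the save_checkpoint_data() call and the
# checkpoint_dir/stanza names are assumptions for illustration only.
#
#     result_info = self.output_results(results, index, source, sourcetype, host,
#                                        checkpoint_data, output_results_policy)
#
#     checkpoint_data['matches_hash'] = result_info.get_hash_of_all_matches()
#     checkpoint_data['content_hash'] = result_info.get_hash_of_all_results()
#     self.save_checkpoint_data(checkpoint_dir, stanza, checkpoint_data)  # assumed helper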