def test_browser(self, browser, **kwargs):
    """
    Determine if the given browser is configured and able to be used.
    """

    success = None

    web_scraper = WebScraper(3)

    # Set the proxy authentication
    try:
        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
            web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
    except splunk.ResourceNotFound:
        cherrypy.response.status = 202
        return self.render_error_json(_("Proxy server information could not be obtained"))

    try:
        result = web_scraper.scrape_page(selector="a",
                                         url=WebInputController.TEST_BROWSER_URL,
                                         browser=browser,
                                         include_raw_content=True)

        if not result:
            success = False
        elif len(result) < 1:
            success = False
        elif 'browser' not in result[0]:
            success = True
        else:
            success = (result[0]['browser'] == browser)
    except Exception:
        logger.exception("Exception generated when attempting to test the browser")
        success = False

    return self.render_json({'success': success})
def test_browser(self, browser, **kwargs):
    """
    Determine if the given browser is configured and able to be used.
    """

    success = None

    web_scraper = WebScraper(3)

    # Set the proxy authentication
    try:
        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
            web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
    except splunk.ResourceNotFound:
        cherrypy.response.status = 202
        return self.render_error_json(_("Proxy server information could not be obtained"))

    try:
        result = web_scraper.scrape_page(selector="a",
                                         url=WebInputController.TEST_BROWSER_URL,
                                         browser=browser,
                                         include_raw_content=True)

        if not result:
            success = False
        elif len(result) < 1:
            success = False
        else:
            success = (result[0]['browser'] == browser)
    except Exception:
        logger.exception("Exception generated when attempting to test the browser")
        success = False

    return self.render_json({'success': success})
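# Hedged sketch (not part of the original source): the success-determination logic from
# test_browser above, factored into a standalone helper so it can be exercised without a
# live WebScraper. The sample inputs assume scrape_page returns a list of match
# dictionaries with an optional 'browser' field, as the handler above implies.
def _browser_result_indicates_success(result, browser):
    if not result or len(result) < 1:
        return False
    if 'browser' not in result[0]:
        return True
    return result[0]['browser'] == browser

assert _browser_result_indicates_success([{'browser': 'firefox'}], 'firefox')
assert not _browser_result_indicates_success([], 'firefox')
assert not _browser_result_indicates_success([{'browser': 'chrome'}], 'firefox')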
def get_scrape_page(self, request_info, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            return self.render_error_json("No URL was provided", 202)

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter is different from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued parameters, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(request_info.session_key):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except (ValueError, TypeError):
                # The timeout is invalid. Ignore this for now, it will get picked up when
                # the user attempts to save the input
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
        except ResourceNotFound:
            return self.render_error_json("Proxy server information could not be obtained", 202)

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        return self.render_error_json(str(e), 220)
    except ServerNotFoundError as e:
        return self.render_error_json(str(e), 220)
    except (SelectorError, SelectorSyntaxError, ExpressionError):
        return self.render_error_json("Selector is invalid.", 220)
    except LoginFormNotFound:
        return self.render_error_json("Login form was not found", 220)
    except FormAuthenticationFailed:
        return self.render_error_json("Form authentication failed", 220)
    except Exception as e:
        logger.exception("Error generated during execution")
        return self.render_error_json(str(e), 500)

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0])
    else:
        return self.render_json(result)
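# Hedged sketch (not from the source): the renaming that get_scrape_page performs for the
# output_as_mv request parameter, shown in isolation. A local stand-in is used in place of
# Splunk's util.normalizeBoolean so the snippet runs on its own; the real helper also
# handles values such as "t", "f", 1, and 0.
def _to_bool(value, default=True):
    if value is None:
        return default
    return str(value).strip().lower() not in ('0', 'false', 'f', 'no', 'n')

def translate_output_as_mv(request_args):
    kw = {}
    if 'output_as_mv' in request_args:
        kw['output_matches_as_mv'] = _to_bool(request_args['output_as_mv'], True)
        # Multi-valued output and separate per-match fields are mutually exclusive
        # (see http://lukemurphey.net/issues/1643)
        kw['output_matches_as_separate_fields'] = not kw['output_matches_as_mv']
    return kw

print(translate_output_as_mv({'output_as_mv': 'false'}))
# -> {'output_matches_as_mv': False, 'output_matches_as_separate_fields': True}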
class WebScraperSearchCommand(SearchCommand):
    """
    The search command takes the arguments provided on the command-line and sends them to
    the modular input functions so that you can run the input manually.
    """

    def __init__(self, url=None, selector=None, username=None, password=None, timeout=30,
                 name_attributes=None, output_as_mv=True, output_matches_as_mv=None,
                 output_matches_as_separate_fields=False, use_element_name=False,
                 page_limit=1, depth_limit=50, url_filter=None, text_separator=" ",
                 raw_content=False, include_raw_content=None, browser=None,
                 match_prefix=None, user_agent=None, empty_matches=False,
                 empty_value='NULL', authentication_url=None, username_field=None,
                 password_field=None):

        # Note: output_matches_as_mv and include_raw_content are supported for legacy purposes

        # Make sure the required arguments are provided
        if url is None:
            raise ValueError("url argument must be provided")

        if selector is None:
            raise ValueError("selector argument must be provided")

        # Use the older output_matches_as_mv field if included
        if output_matches_as_mv is not None:
            output_as_mv = output_matches_as_mv

        # Decide on whether to include the matches as separate fields if output_as_mv is set
        if normalizeBoolean(output_as_mv):
            output_as_mv = True
            output_matches_as_separate_fields = False
        else:
            output_as_mv = False
            output_matches_as_separate_fields = True

        if name_attributes is None:
            name_attributes = []

        # Make the web scraper instance
        self.web_scraper = WebScraper(int(timeout))
        self.web_scraper.user_agent = user_agent

        # Save the parameters
        self.params = {
            "url": url,
            "selector": selector,
            "name_attributes": name_attributes,
            "output_matches_as_mv": normalizeBoolean(output_as_mv),
            "output_matches_as_separate_fields": normalizeBoolean(output_matches_as_separate_fields),
            "include_empty_matches": empty_matches,
            "empty_value": empty_value,
            "use_element_name": normalizeBoolean(use_element_name),
            "page_limit": int(page_limit),
            "depth_limit": int(depth_limit),
            "url_filter": url_filter,
            "include_raw_content": normalizeBoolean(include_raw_content) if include_raw_content is not None else normalizeBoolean(raw_content),
            "text_separator": text_separator,
            "browser": browser,
            "match_prefix": match_prefix
        }

        if username is not None and password is not None:
            self.web_scraper.set_authentication(username, password, authentication_url,
                                                username_field, password_field)

        SearchCommand.__init__(self, run_in_preview=True, logger_name="web_scrape")

        self.logger.info("Web scraper running against url=%s", url)

    def handle_results(self, results, session_key, in_preview):
        # FYI: we ignore results since this is a generating command

        # Make sure that the URL is using SSL if on Splunk Cloud
        if ModularInput.is_on_cloud(session_key) and not self.params["url"].startswith("https"):
            raise Exception("The URL to scrape must use HTTPS; Splunk Cloud doesn't allow unsecured network access")

        # Make sure that links only get extracted if they point to HTTPS sites when on Splunk Cloud
        self.params['https_only'] = ModularInput.is_on_cloud(session_key)

        # Do the scraping
        results = self.web_scraper.scrape_page(**self.params)

        # Output the results
        self.output_results(results)
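# Hedged usage sketch (not from the source): constructing the search command directly with
# the same keyword arguments the Splunk search pipeline would pass as key=value pairs on
# the search bar. In normal operation Splunk instantiates the class through its
# search-command protocol; the URL and selector values below are placeholders.
example_command = WebScraperSearchCommand(url="http://example.com", selector="h1",
                                          page_limit=1, timeout=30)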
def scrape_page(self, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            cherrypy.response.status = 202
            return self.render_error_json(_("No URL was provided"))

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter is different from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued parameters, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except (ValueError, TypeError):
                # The timeout is invalid. Ignore this for now, it will get picked up when
                # the user attempts to save the input
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse.urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_json(_("Proxy server information could not be obtained"))

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        cherrypy.response.status = 220
        return self.render_error_json(_(str(e)))
    except ServerNotFoundError as e:
        cherrypy.response.status = 220
        return self.render_error_json(_(str(e)))
    except (SelectorError, SelectorSyntaxError, ExpressionError):
        cherrypy.response.status = 220
        return self.render_error_json(_("Selector is invalid."))
    except LoginFormNotFound:
        cherrypy.response.status = 220
        return self.render_error_json("Login form was not found")
    except FormAuthenticationFailed:
        cherrypy.response.status = 220
        return self.render_error_json("Form authentication failed")
    except Exception as e:
        cherrypy.response.status = 500
        logger.exception("Error generated during execution")
        return self.render_error_json(_(str(e)))

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0], set_mime='application/json')
    else:
        return self.render_json(result, set_mime='application/json')
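# Hedged sketch (not from the source): the link-extraction policy applied above. On Splunk
# Cloud the handler forces https_only=True regardless of what the caller asked for;
# otherwise the caller's https_only value (if any) is honoured, and None means "not set".
def resolve_https_only(on_cloud, requested=None):
    if on_cloud:
        return True
    return bool(requested) if requested is not None else None

assert resolve_https_only(True, False) is True
assert resolve_https_only(False, True) is True
assert resolve_https_only(False) is None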
def run(self, stanza, cleaned_params, input_config):

    # Make the parameters
    interval = cleaned_params["interval"]
    title = cleaned_params["title"]
    url = cleaned_params["url"]
    selector = cleaned_params.get("selector", None)
    username = cleaned_params.get("username", None)
    password = cleaned_params.get("password", None)
    name_attributes = cleaned_params.get("name_attributes", [])
    user_agent = cleaned_params.get("user_agent", None)
    timeout = cleaned_params.get("timeout", self.timeout)
    sourcetype = cleaned_params.get("sourcetype", "web_input")
    host = cleaned_params.get("host", None)
    index = cleaned_params.get("index", "default")
    conf_stanza = cleaned_params.get("configuration", None)
    use_element_name = cleaned_params.get("use_element_name", False)
    page_limit = cleaned_params.get("page_limit", 1)
    url_filter = cleaned_params.get("url_filter", None)
    depth_limit = cleaned_params.get("depth_limit", 25)
    raw_content = cleaned_params.get("raw_content", False)
    text_separator = cleaned_params.get("text_separator", " ")
    browser = cleaned_params.get("browser", WebScraper.INTEGRATED_CLIENT)
    output_as_mv = cleaned_params.get("output_as_mv", True)
    output_results_policy = cleaned_params.get("output_results", None)
    username_field = cleaned_params.get("username_field", None)
    password_field = cleaned_params.get("password_field", None)
    authentication_url = cleaned_params.get("authentication_url", None)
    source = stanza

    if self.needs_another_run(input_config.checkpoint_dir, stanza, interval):

        # Don't scan the URL if the URL is unencrypted and the host is on Cloud
        if self.is_on_cloud(input_config.session_key) and not url.scheme == "https":
            self.logger.warn("The URL will not be processed because the host is running on Splunk Cloud and the URL isn't using encryption, url=%s", url.geturl())
            return

        # Don't scan the URL if the login URL is unencrypted and the host is on Cloud
        if self.is_on_cloud(input_config.session_key) and authentication_url is not None and authentication_url.scheme != "https":
            self.logger.warn("The URL will not be processed because the host is running on Splunk Cloud and the login URL isn't using encryption, authentication_url=%s", authentication_url.geturl())
            return

        # Get the proxy configuration
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                self.get_proxy_config(input_config.session_key, conf_stanza)
        except splunk.ResourceNotFound:
            logger.error("The proxy configuration could not be loaded (resource not found). The execution will be skipped for now for this input with stanza=%s", stanza)
            return
        except splunk.SplunkdConnectionException:
            logger.error("The proxy configuration could not be loaded (splunkd connection problem). The execution will be skipped for now for this input with stanza=%s", stanza)
            return

        # Get the secure password if necessary
        if username is not None:
            secure_password = self.get_secure_password(realm=stanza,
                                                       session_key=input_config.session_key)

            if secure_password is not None:
                password = secure_password['content']['clear_password']
                self.logger.debug("Successfully loaded the secure password for input=%s", stanza)

        # Get the information from the page
        try:
            # Make sure the page_limit is not too small
            if page_limit is None or page_limit == "" or page_limit < 1:
                logger.warn("The parameter is too small for page_limit=%r", page_limit)
                page_limit = 1

            # Make sure the depth_limit is valid
            if depth_limit is None or depth_limit == "" or depth_limit < 1:
                logger.warn("The parameter is too small for depth_limit=%r", depth_limit)
                depth_limit = 50

            # Determine how to make the match fields
            output_matches_as_mv = True
            output_matches_as_separate_fields = False

            if not output_as_mv:
                output_matches_as_mv = False
                output_matches_as_separate_fields = True

            additional_fields = {'title': title}

            # Make an instance of the web-scraper and initialize it
            web_scraper = WebScraper(timeout, logger=logger)
            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
            web_scraper.user_agent = user_agent
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

            # Get the checkpoint data so that we can determine the prior hash of the results if necessary
            checkpoint_data = self.get_checkpoint_data(input_config.checkpoint_dir, stanza)

            if checkpoint_data is None:
                checkpoint_data = {}

            # Keep a list of the matches so that we can determine if any of the results changed
            result_info = WebInputResult()

            if output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_CONTENTS_CHANGE or output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_MATCHES_CHANGE:
                output_fx = None
            else:
                # Setup the output function so that we can stream the results
                output_fx = lambda result: self.output_results([result], index, source, sourcetype, host, checkpoint_data, None, result_info)

            # Perform the scrape
            results = web_scraper.scrape_page(url, selector, name_attributes,
                                              use_element_name=use_element_name,
                                              page_limit=page_limit,
                                              depth_limit=depth_limit,
                                              url_filter=url_filter,
                                              include_raw_content=raw_content,
                                              text_separator=text_separator,
                                              browser=browser,
                                              output_matches_as_mv=output_matches_as_mv,
                                              output_matches_as_separate_fields=output_matches_as_separate_fields,
                                              additional_fields=additional_fields,
                                              https_only=self.is_on_cloud(input_config.session_key),
                                              output_fx=output_fx)

            # Determine the number of results
            if output_fx is None:
                matches = len(results)
            else:
                matches = results

            logger.info("Successfully executed the website input, matches_count=%r, stanza=%s, url=%s", matches, stanza, url.geturl())

        except LoginFormNotFound:
            logger.warn('Form authentication failed since the form could not be found, stanza=%s', stanza)
        except FormAuthenticationFailed as e:
            logger.warn('Form authentication failed, stanza=%s, error="%s"', stanza, str(e))
        except WebClientException as e:
            logger.warn('Client connection failed, stanza=%s, error="%s"', stanza, str(e))
        except Exception:
            logger.exception("An exception occurred when attempting to retrieve information from the web-page, stanza=%s", stanza)

        # Get the time that the input last ran
        last_ran = self.last_ran(input_config.checkpoint_dir, stanza)

        # If we didn't output the results already (using streaming output), then do it now
        if output_fx is None:
            self.output_results(results, index, source, sourcetype, host, checkpoint_data,
                                output_results_policy, result_info)

        # Make the new checkpoint data dictionary
        new_checkpoint_data = {
            'last_run': self.get_non_deviated_last_run(last_ran, interval, stanza),
            'matches_hash': result_info.get_hash_of_all_matches(),
            'content_hash': result_info.get_hash_of_all_results()
        }

        # Save the checkpoint so that we remember when we last executed this
        self.save_checkpoint_data(input_config.checkpoint_dir, stanza, new_checkpoint_data)

        # Force garbage collection at the end of the run
        # This is useful since inputs often run infrequently and we want to clean up
        # after ourselves while we wait for the next run
        import gc
        gc.collect()
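# Hedged illustration (not from the source): the shape of the checkpoint dictionary that
# run() writes at the end of each execution, per the code above. The values are
# placeholders; the two hashes come from WebInputResult and 'last_run' is the
# deviation-corrected epoch time of the last execution.
example_checkpoint_data = {
    'last_run': 1500000000,
    'matches_hash': '<hash of all matches>',
    'content_hash': '<hash of all page results>'
}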