def handle_results(self, results, session_key, in_preview):
    # FYI: we ignore results since this is a generating command

    # Make sure that the URL is using SSL if on Splunk Cloud
    if ModularInput.is_on_cloud(session_key) and not self.params["url"].startswith("https"):
        raise Exception("The URL to scrape must use HTTPS; Splunk Cloud doesn't allow " +
                        "unsecured network access")

    # Make sure that links only get extracted if they point to HTTPS sites when on Splunk Cloud
    self.params['https_only'] = ModularInput.is_on_cloud(session_key)

    # Do the scraping
    results = self.web_scraper.scrape_page(**self.params)

    # Output the results
    self.output_results(results)
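# Illustrative sketch (not part of the app): what the https_only flag set above
# implies for link extraction. WebScraper's actual implementation may differ;
# filter_https_links is a hypothetical helper shown only to clarify the behavior.
from urllib.parse import urlparse

def filter_https_links(links, https_only):
    """Keep only HTTPS links when https_only is set (as it is on Splunk Cloud)."""
    if not https_only:
        return list(links)

    return [link for link in links if urlparse(link).scheme == "https"]

# Example:
#   filter_https_links(["https://a.example", "http://b.example"], https_only=True)
#   -> ["https://a.example"]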
def get_scrape_page(self, request_info, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration).
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            return self.render_error_json("No URL was provided", 202)

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter's name differs from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued fields, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(request_info.session_key):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                # The timeout is invalid. Ignore this for now; it will get picked up when
                # the user attempts to save the input.
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user,
                                  proxy_password)

        except ResourceNotFound:
            return self.render_error_json("Proxy server information could not be obtained", 202)

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        return self.render_error_json(str(e), 220)
    except ServerNotFoundError as e:
        return self.render_error_json(str(e), 220)
    except (SelectorError, SelectorSyntaxError, ExpressionError):
        return self.render_error_json("Selector is invalid", 220)
    except LoginFormNotFound:
        return self.render_error_json("Login form was not found", 220)
    except FormAuthenticationFailed:
        return self.render_error_json("Form authentication failed", 220)
    except Exception as e:
        logger.exception("Error generated during execution")
        return self.render_error_json(str(e), 500)

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0])
    else:
        return self.render_json(result)
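# Illustrative sketch (not part of the app) of the kwargs-translation pattern used in
# get_scrape_page() above: request parameters arrive as strings and are coerced and
# renamed before being handed to the scraper. parse_scrape_args and to_bool are
# hypothetical names, and util.normalizeBoolean's exact semantics may differ.
def parse_scrape_args(kwargs):

    def to_bool(value, default):
        if value is None:
            return default
        return str(value).strip().lower() in ('1', 'true', 't', 'yes', 'y')

    kw = {}

    # output_as_mv is renamed to the argument name the scraper class accepts
    if 'output_as_mv' in kwargs:
        kw['output_matches_as_mv'] = to_bool(kwargs['output_as_mv'], True)
        kw['output_matches_as_separate_fields'] = not kw['output_matches_as_mv']

    # Integer parameters are parsed defensively; invalid values are reported
    # later, when the user attempts to save the input
    for name in ('page_limit', 'depth_limit'):
        if name in kwargs:
            try:
                kw[name] = int(kwargs[name])
            except ValueError:
                pass

    return kw

# Example:
#   parse_scrape_args({'output_as_mv': 'false', 'page_limit': '5'})
#   -> {'output_matches_as_mv': False, 'output_matches_as_separate_fields': True,
#       'page_limit': 5}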
def get_load_page(self, request_info, url, **kwargs):
    """
    Proxy a web-page through so that a UI can be displayed for showing potential results.
    """

    web_client = None

    try:
        # --------------------------------------
        # 1: Make sure that the user has permission to make inputs. We don't want to allow
        #    people to use this as a general purpose proxy.
        # --------------------------------------
        if not WebInputOperationsHandler.hasCapability('edit_modinput_web_input') \
                and not WebInputOperationsHandler.hasCapability('admin_all_objects'):
            return self.render_error_html(
                'You need the "edit_modinput_web_input" capability ' +
                'to make website inputs', 403)

        # Don't allow proxying of JavaScript files
        if url.endswith(".js"):
            return {
                'payload': '',
                'status': 200,
                'headers': {
                    'Content-Type': 'application/javascript'
                },
            }

        # --------------------------------------
        # 2: Only allow HTTPS if the install is on Splunk Cloud
        # --------------------------------------
        if ModularInput.is_on_cloud(request_info.session_key) and not url.startswith("https://"):
            # TODO: determine the best status code
            return self.render_error_html('URLs on Splunk Cloud must use the HTTPS protocol', 401)

        # --------------------------------------
        # 3: Perform a request for the page
        # --------------------------------------

        # Get the proxy configuration
        conf_stanza = "default"

        try:
            web_input = WebInput(timeout=10)

            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

        except ResourceNotFound:
            return self.render_error_html("Proxy server information could not be obtained", 202)

        # Get the timeout to use
        timeout = 15

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                timeout = 15

        # Get the user-agent
        user_agent = kwargs.get('user_agent', None)

        # Get the information on the browser to use
        browser = None

        if 'browser' in kwargs:
            browser = kwargs['browser']

        # Make the client
        if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
            web_client = DefaultWebClient(timeout, user_agent, logger)
        elif browser == WebScraper.FIREFOX:
            web_client = FirefoxClient(timeout, user_agent, logger)
        elif browser == WebScraper.CHROME:
            web_client = ChromeClient(timeout, user_agent, logger)

        web_client.setProxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

        # Get the username and password
        username = kwargs.get('username', None)
        password = kwargs.get('password', None)

        username_field = kwargs.get('username_field', None)
        password_field = kwargs.get('password_field', None)
        authentication_url = kwargs.get('authentication_url', None)

        if username is not None and password is not None:
            web_client.setCredentials(username, password)

            if authentication_url is not None:
                logger.debug("Authenticating using form login in scrape_page")
                web_client.doFormLogin(authentication_url, username_field, password_field)

        # Get the page
        try:
            content = web_client.get_url(url, 'GET')
            response = web_client.get_response_headers()
        except:
            logger.exception("Exception generated while attempting to get content for url=%s",
                             url)

            return self.render_error_html(
                "Page preview could not be obtained using a web-browser", 500)

        # --------------------------------------
        # 4: Render the content with the browser if necessary
        # --------------------------------------
        """
        if 'text/html' in response['content-type']:

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Try rendering the content using a web-browser
            try:
                if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                    web_scraper = WebScraper(timeout=timeout)
                    web_scraper.set_proxy(proxy_type, proxy_server, proxy_port,
                                          proxy_user, proxy_password)
                    web_scraper.set_authentication(username, password)
                    content = web_scraper.get_result_browser(urlparse(url), browser)

            except:
                logger.exception("Exception generated while attempting to get browser " +
                                 "rendering of url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html(
                    "Page preview could not be obtained using a web-browser")
        """

        # --------------------------------------
        # 5: Rewrite the links in HTML files so that they also point to the internal proxy
        # --------------------------------------
        if "<html" in content:

            # Parse the content
            html = lxml.html.document_fromstring(content)

            # Rewrite the links to point to this internal proxy
            rewrite_using_internal_proxy = True

            if rewrite_using_internal_proxy:

                def relocate_href(link):
                    """
                    Change the hrefs such that they go through the proxy.
                    """
                    link = urljoin(url, link)

                    if link.endswith(".js"):
                        return ""
                    if not link.endswith(".css"):
                        return "load_page?url=" + link
                    else:
                        return link

                html.rewrite_links(relocate_href)

                # Block the href links
                for element, attribute, _, _ in html.iterlinks():
                    if element.tag == "a" and attribute == "href":
                        element.set('href', "#")
                    elif element.tag == "form" and attribute == "action":
                        element.set('action', "?")
            else:
                html.make_links_absolute(url)

            # Determine if we should clean the JS
            clean_script = True

            if 'clean_script' in kwargs:
                clean_script = util.normalizeBoolean(kwargs['clean_script'])

            # Determine if we should clean the CSS
            clean_styles = False

            if 'clean_styles' in kwargs:
                clean_styles = util.normalizeBoolean(kwargs['clean_styles'])

            # Clean up the HTML
            if clean_styles or clean_script:

                kill_tags = []

                if clean_script:
                    kill_tags = ["script"]

                # Remove the script blocks
                cleaner = Cleaner(page_structure=False, kill_tags=kill_tags, javascript=False,
                                  links=False, style=clean_styles, safe_attrs_only=False)

                # Get the content
                content = lxml.html.tostring(cleaner.clean_html(html), encoding="unicode")

            else:
                content = lxml.html.tostring(html, encoding="unicode")

        # --------------------------------------
        # 6: Respond with the results
        # --------------------------------------
        headers = {}

        if 'content-type' in response:
            headers['Content-Type'] = response['content-type']
        else:
            headers['Content-Type'] = 'text/html'

        # --------------------------------------
        # 7: Clear out JavaScript files
        # --------------------------------------
        if response.get('content-type', "") == "application/javascript" \
                or response.get('content-type', "") == "application/x-javascript" \
                or response.get('content-type', "") == "text/javascript" \
                or url.endswith(".js"):
            return {'payload': '', 'headers': headers, 'status': 200}

        return {'payload': content, 'headers': headers, 'status': 200}

    except LoginFormNotFound:
        logger.debug("Login form not found")
        return self.render_error_html("Login form was not found", 200)

    except FormAuthenticationFailed as e:
        logger.debug("Form authentication failed: " + str(e))
        return self.render_error_html("Form authentication failed: " + str(e), 200)

    except:
        logger.exception("Error when attempting to proxy an HTTP request")
        return self.render_error_html("Page preview could not be created", 500)

    finally:
        if web_client:
            web_client.close()
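# Self-contained sketch (illustration only) of the link-rewriting step in
# get_load_page() above: page links are routed back through the load_page proxy
# endpoint, script URLs are blanked, and stylesheet URLs pass through untouched.
# rewrite_for_proxy is a hypothetical name; the real logic lives inline in the
# handler and additionally disables anchors and form actions afterwards.
import lxml.html
from urllib.parse import urljoin

def rewrite_for_proxy(content, base_url):
    html = lxml.html.document_fromstring(content)

    def relocate_href(link):
        link = urljoin(base_url, link)

        if link.endswith(".js"):
            return ""                       # never proxy JavaScript
        elif not link.endswith(".css"):
            return "load_page?url=" + link  # route pages through the proxy
        else:
            return link                     # stylesheets load directly

    html.rewrite_links(relocate_href)
    return lxml.html.tostring(html, encoding="unicode")

# Example: an href of "/next" against base "http://example.com" comes back as
# "load_page?url=http://example.com/next" in the returned document.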