def test_scrape_page_mv(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://textcritical.net/"),
                                  selector_field.to_python("h2"),
                                  output_matches_as_mv=True)

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 3)

    out = StringIO()
    web_input.output_event(result, stanza="web_input://textcritical_net", index="main",
                           source="test_web_input", sourcetype="sourcetype", out=out)

    self.assertEquals(len(re.findall("match=", out.getvalue())), 3)

def test_needs_another_run(self):
    # Test case where file does not exist
    self.assertTrue(WebInput.needs_another_run(
        "/Users/lmurphey/Applications/splunk/var/lib/splunk/modinputs/web_input",
        "web_input://DoesNotExist", 60))

    # Test an interval right at the earlier edge
    self.assertFalse(WebInput.needs_another_run(
        os.path.join(self.get_test_dir(), "configs"),
        "web_input://TextCritical.com", 60, 1365486765))

    # Test an interval at the later edge
    self.assertFalse(WebInput.needs_another_run(
        os.path.join(self.get_test_dir(), "configs"),
        "web_input://TextCritical.com", 10, 1365486775))

    # Test interval beyond later edge
    self.assertTrue(WebInput.needs_another_run(
        os.path.join(self.get_test_dir(), "configs"),
        "web_input://TextCritical.com", 10, 1365486776))

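# The three dated assertions above imply a simple timing rule. The sketch below is a
# minimal illustration of that rule, assuming the checkpoint for
# web_input://TextCritical.com records last_run=1365486765; the real
# WebInput.needs_another_run() reads last_run from the checkpoint file and may differ in detail.
def _needs_another_run_sketch(last_run, interval, cur_time):
    # Another run is needed only once the interval has fully elapsed
    return cur_time > (last_run + interval)
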
def test_unparsable(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/media/images/link_external.png"),
        selector_field.to_python(".hero-unit .main_background"),
        timeout=3, output_matches_as_mv=True)

    self.assertEqual(result['match'], [])

def scrape_page(self, url, selector, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = {}

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

        # Get the user-agent string
        user_agent = None

        if 'user_agent' in kwargs:
            user_agent = kwargs['user_agent']

        # Determine if we should include empty matches
        include_empty_matches = False

        if 'include_empty_matches' in kwargs:
            include_empty_matches = util.normalizeBoolean(kwargs['include_empty_matches'], True)

        # Get the proxy configuration
        conf_stanza = "default"

        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)
        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_json(_("Proxy server information could not be obtained"))

        # Scrape the page
        result = WebInput.scrape_page(url, selector, username=username, password=password,
                                      include_empty_matches=include_empty_matches,
                                      proxy_type=proxy_type, proxy_server=proxy_server,
                                      proxy_port=proxy_port, proxy_user=proxy_user,
                                      proxy_password=proxy_password, user_agent=user_agent)

    except FieldValidationException as e:
        cherrypy.response.status = 202
        return self.render_error_json(_(str(e)))

def test_scrape_unavailable_page(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://192.168.30.23/"),
                                  selector_field.to_python(".hero-unit.main_background"),
                                  timeout=3)

    self.assertEqual(result['timed_out'], True)

def test_scrape_page(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://textcritical.net/"),
                                  selector_field.to_python(".hero-unit.main_background"))

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 1)

def test_scrape_page_name_attributes_separate_fields(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://127.0.0.1:8888"),
                                  selector_field.to_python(".hd"),
                                  username="******", password="******",
                                  timeout=3, name_attributes=["class"],
                                  output_matches_as_separate_fields=True,
                                  output_matches_as_mv=False)

    self.assertEqual(result['match_hd_1'], 'Mode:')

def test_scrape_page_name_attributes(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://127.0.0.1:8888"),
                                  selector_field.to_python(".hd"),
                                  username="******", password="******",
                                  timeout=3, name_attributes=["class"])

    self.assertEqual(len(result['hd']), 31)

def test_scrape_encoding_detect_meta(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/work/new-testament/Mark/1/2"),
        selector_field.to_python(".verse-container"),
        charset_detect_meta_enabled=True,
        charset_detect_content_type_header_enabled=False,
        charset_detect_sniff_enabled=False)

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(result['encoding'], "utf-8")

def test_browser(self, browser, **kwargs):
    """
    Determine if the given browser is configured and able to be used.
    """

    success = None
    web_scraper = WebScraper(3)

    # Set the proxy authentication
    try:
        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
            web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
    except splunk.ResourceNotFound:
        cherrypy.response.status = 202
        return self.render_error_json(_("Proxy server information could not be obtained"))

    try:
        result = web_scraper.scrape_page(selector="a", url=WebInputController.TEST_BROWSER_URL,
                                         browser=browser, include_raw_content=True)

        if not result:
            success = False
        elif len(result) < 1:
            success = False
        elif 'browser' not in result[0]:
            success = True
        else:
            success = (result[0]['browser'] == browser)
    except Exception as exception:
        logger.exception("Exception generated when attempting to test the browser")
        success = False

    return self.render_json({'success': success})

def test_scrape_page_with_invalid_credentials(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://127.0.0.1:8888"),
                                  selector_field.to_python("tr"),
                                  timeout=3, output_matches_as_mv=True)

    #print result['match']
    self.assertEqual(len(result['match']), 0)

def test_scrape_page_name_attributes_escaped_name(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://127.0.0.1:8888"),
                                  selector_field.to_python("input"),
                                  username="******", password="******",
                                  timeout=3, name_attributes=["onclick"],
                                  include_empty_matches=True)

    self.assertTrue('btnBerTest__' in result)
    self.assertTrue('btnReset__' in result)

def test_scrape_page_adjacent_selector(self):
    # For bug: http://lukemurphey.net/issues/773
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://textcritical.net/"),
                                  selector_field.to_python("h1+p,.sharing-buttons"),
                                  timeout=3, output_matches_as_mv=True)

    self.assertEqual(len(result['match']), 2)

def get_login_fields(self, url=None, **kwargs):
    web_input = WebInput(timeout=10)

    proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
        web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

    client = MechanizeClient(5)

    logger.debug("Using proxy %s to detect form fields", proxy_server)

    user_agent = kwargs.get('user_agent')

    _, username_field, password_field = client.detectFormFields(url, proxy_type, proxy_server,
                                                                proxy_port, proxy_user,
                                                                proxy_password, user_agent)

    return self.render_json({
        'username_field': username_field or "",
        'password_field': password_field or ""
    })

def test_scrape_encoding_detect_page(self):
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/work/new-testament/Mark/1/2?async"),
        selector_field.to_python(".verse-container"))

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 45)
    #print result['match']
    self.assertEqual(unicodedata.normalize('NFC', result['match'][1]),
                     unicodedata.normalize('NFC', u"2 Καθὼς γέγραπται ἐν τῷ Ἠσαίᾳ τῷ προφήτῃ Ἰδοὺ ἀποστέλλω τὸν ἄγγελόν μου πρὸ προσώπου σου , ὃς κατασκευάσει τὴν ὁδόν σου :"))
    self.assertEqual(result['encoding'], "utf-8")

def test_scrape_page_child_text(self):
    # This test ensures that text from nodes under the selected nodes is properly extracted
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("http://textcritical.net/"),
                                  selector_field.to_python(".hero-unit.main_background"),
                                  output_matches_as_mv=True)

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 1)
    self.assertEqual(result['match'][0],
                     "Ancient Greek, Modern Design TextCritical.net is a website that provides a library of ancient Greek works")

def handle_results(self, results, in_preview, session_key):
    # FYI: we ignore results since this is a generating command

    # Do the scraping
    result = WebInput.scrape_page(self.url, self.selector, self.username, self.password,
                                  self.timeout, self.name_attributes,
                                  self.output_matches_as_mv,
                                  self.output_matches_as_separate_fields,
                                  include_empty_matches=False, proxy_type="http",
                                  proxy_server=None, proxy_port=None, proxy_user=None,
                                  proxy_password=None)

    self.logger.debug("Retrieved results, result=%r", result)

    # Output the results
    self.output_results([result])

def test_scrape_page_bad_encoding(self):
    # http://lukemurphey.net/issues/987
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(
        url_field.to_python("http://rss.slashdot.org/Slashdot/slashdot"),
        selector_field.to_python("description"))

    self.assertEqual(result['response_code'], 200)
    self.assertGreater(len(result['match']), 0)
    self.assertEqual(result['encoding'], "ISO-8859-1")

def get_get_login_fields(self, request_info, url=None, **kwargs):
    web_input = WebInput(timeout=10)

    proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
        web_input.get_proxy_config(request_info.session_key, "default")

    client = MechanizeClient(5)

    logger.debug("Using proxy %s to detect form fields", proxy_server)

    user_agent = kwargs.get('user_agent')

    _, username_field, password_field = client.detectFormFields(url, proxy_type, proxy_server,
                                                                proxy_port, proxy_user,
                                                                proxy_password, user_agent)

    return self.render_json({
        'username_field': username_field or "",
        'password_field': password_field or ""
    })

def test_browser(self, browser, **kwargs):
    """
    Determine if the given browser is configured and able to be used.
    """

    success = None
    web_scraper = WebScraper(3)

    # Set the proxy authentication
    try:
        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
            web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
    except splunk.ResourceNotFound:
        cherrypy.response.status = 202
        return self.render_error_json(_("Proxy server information could not be obtained"))

    try:
        result = web_scraper.scrape_page(selector="a", url=WebInputController.TEST_BROWSER_URL,
                                         browser=browser, include_raw_content=True)

        if not result:
            success = False
        elif len(result) < 1:
            success = False
        else:
            success = (result[0]['browser'] == browser)
    except Exception as exception:
        logger.exception("Exception generated when attempting to test the browser")
        success = False

    return self.render_json({'success': success})

def get_scrape_page(self, request_info, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            return self.render_error_json("No URL was provided", 202)

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter is different from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued parameters, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(request_info.session_key):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except:
                # The timeout is invalid. Ignore this for now, it will get picked up when
                # the user attempts to save the input
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
        except ResourceNotFound:
            return self.render_error_json("Proxy server information could not be obtained", 202)

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        return self.render_error_json(str(e), 220)

    except ServerNotFoundError as e:
        return self.render_error_json(str(e), 220)

    except (SelectorError, SelectorSyntaxError, ExpressionError):
        return self.render_error_json("Selector is invalid. ", 220)

    except LoginFormNotFound:
        return self.render_error_json("Login form was not found", 220)

    except FormAuthenticationFailed:
        return self.render_error_json("Form authentication failed", 220)

    except Exception as e:
        logger.exception("Error generated during execution")
        return self.render_error_json(str(e), 500)

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0])
    else:
        return self.render_json(result)

def get_load_page(self, request_info, url, **kwargs):
    """
    Proxy a web-page through so that a UI can be displayed for showing potential results.
    """

    web_client = None

    try:
        # --------------------------------------
        # 1: Make sure that user has permission to make inputs. We don't want to allow people
        #    to use this as a general proxy.
        # --------------------------------------
        if not WebInputOperationsHandler.hasCapability('edit_modinput_web_input') \
                and not WebInputOperationsHandler.hasCapability('admin_all_objects'):
            return self.render_error_html('You need the "edit_modinput_web_input" capability ' +
                                          'to make website inputs', 403)

        # Don't allow proxying of the javascript files
        if url.endswith(".js"):
            return {
                'payload': '',
                'status': 200,
                'headers': {
                    'Content-Type': 'application/javascript'
                },
            }

        # --------------------------------------
        # 2: Only allow HTTPS if the install is on Splunk Cloud
        # --------------------------------------
        if ModularInput.is_on_cloud(request_info.session_key) and not url.startswith("https://"):
            return self.render_error_html('URLs on Splunk Cloud must use HTTPS protocol',
                                          401)  # TODO: determine best code

        # --------------------------------------
        # 3: Perform a request for the page
        # --------------------------------------
        # Get the proxy configuration
        conf_stanza = "default"

        try:
            web_input = WebInput(timeout=10)

            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)
        except ResourceNotFound:
            return self.render_error_html("Proxy server information could not be obtained", 202)

        # Get the timeout to use
        timeout = None

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                timeout = 15
        else:
            timeout = 15

        # Get the user-agent
        user_agent = kwargs.get('user_agent', None)

        # Get the information on the browser to use
        browser = None

        if 'browser' in kwargs:
            browser = kwargs['browser']

        # Make the client
        if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
            web_client = DefaultWebClient(timeout, user_agent, logger)
        elif browser == WebScraper.FIREFOX:
            web_client = FirefoxClient(timeout, user_agent, logger)
        elif browser == WebScraper.CHROME:
            web_client = ChromeClient(timeout, user_agent, logger)

        web_client.setProxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

        # Get the username and password
        username = kwargs.get('username', None)
        password = kwargs.get('password', None)

        username_field = kwargs.get('username_field', None)
        password_field = kwargs.get('password_field', None)
        authentication_url = kwargs.get('authentication_url', None)

        if username is not None and password is not None:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            web_client.setCredentials(username, password)

            if authentication_url is not None:
                logger.debug("Authenticating using form login in scrape_page")
                web_client.doFormLogin(authentication_url, username_field, password_field)

        # Get the page
        try:
            content = web_client.get_url(url, 'GET')
            response = web_client.get_response_headers()
        except:
            logger.exception("Exception generated while attempting to get content for url=%s", url)
            return self.render_error_html("Page preview could not be obtained using a web-browser", 500)

        # --------------------------------------
        # 4: Render the content with the browser if necessary
        # --------------------------------------
        """
        if 'text/html' in response['content-type']:

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Try rendering the content using a web-browser
            try:
                if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                    web_scraper = WebScraper(timeout=timeout)
                    web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                    web_scraper.set_authentication(username, password)
                    content = web_scraper.get_result_browser(urlparse(url), browser)

            except:
                logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html("Page preview could not be obtained using a web-browser")
        """

        # --------------------------------------
        # 5: Rewrite the links in HTML files so that they also point to the internal proxy
        # --------------------------------------
        if "<html" in content:

            # Parse the content
            html = lxml.html.document_fromstring(content)

            # Rewrite the links to point to this internal proxy
            rewrite_using_internal_proxy = True

            if rewrite_using_internal_proxy:

                def relocate_href(link):
                    """
                    Change the hrefs such that they go through the proxy.
                    """

                    link = urljoin(url, link)

                    if link.endswith(".js"):
                        return ""
                    if not link.endswith(".css"):
                        return "load_page?url=" + link
                    else:
                        return link

                html.rewrite_links(relocate_href)

                # Block the href links
                for element, attribute, _, _ in html.iterlinks():
                    if element.tag == "a" and attribute == "href":
                        element.set('href', "#")

                    elif element.tag == "form" and attribute == "action":
                        element.set('action', "?")
            else:
                html.make_links_absolute(url)

            # Determine if we should clean the JS
            clean_script = True

            if 'clean_script' in kwargs:
                clean_script = util.normalizeBoolean(kwargs['clean_script'])

            # Determine if we should clean the CSS
            clean_styles = False

            if 'clean_styles' in kwargs:
                clean_styles = util.normalizeBoolean(kwargs['clean_styles'])

            # Clean up the HTML
            if clean_styles or clean_script:

                kill_tags = []

                if clean_script:
                    kill_tags = ["script"]

                # Remove the script blocks
                cleaner = Cleaner(page_structure=False, kill_tags=kill_tags, javascript=False,
                                  links=False, style=clean_styles, safe_attrs_only=False)

                # Get the content
                content = lxml.html.tostring(cleaner.clean_html(html), encoding="unicode")

            else:
                content = lxml.html.tostring(html, encoding="unicode")

        # --------------------------------------
        # 6: Respond with the results
        # --------------------------------------
        headers = {}

        if 'content-type' in response:
            headers['Content-Type'] = response['content-type']
        else:
            headers['Content-Type'] = 'text/html'

        # --------------------------------------
        # 7: Clear Javascript files
        # --------------------------------------
        if response.get('content-type', "") == "application/javascript" \
                or response.get('content-type', "") == "application/x-javascript" \
                or response.get('content-type', "") == "text/javascript" \
                or url.endswith(".js"):
            return {'payload': '', 'headers': headers, 'status': 200}

        return {'payload': content, 'headers': headers, 'status': 200}

    except LoginFormNotFound:
        logger.debug("Login form not found")
        return self.render_error_html("Login form was not found", 200)

    except FormAuthenticationFailed as e:
        logger.debug("Form authentication failed: " + str(e))
        return self.render_error_html("Form authentication failed: " + str(e), 200)

    except:
        logger.exception("Error when attempting to proxy an HTTP request")
        return self.render_error_html("Page preview could not be created", 500)

    finally:
        if web_client:
            web_client.close()

def test_is_expired(self):
    self.assertFalse(WebInput.is_expired(time.time(), 30))
    self.assertTrue(WebInput.is_expired(time.time() - 31, 30))

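# A minimal sketch of the expiration rule the two assertions above imply, assuming
# is_expired(last_run, expiration) treats the second argument as a window in seconds;
# the actual implementation may differ.
def _is_expired_sketch(last_run, expiration_seconds):
    # Expired once more than expiration_seconds have passed since last_run
    return (time.time() - last_run) > expiration_seconds
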
def test_save_checkpoint(self):
    WebInput.save_checkpoint_data(self.tmp_dir, "web_input://TextCritical.com", {'last_run': 100})
    self.assertEquals(WebInput.last_ran(self.tmp_dir, "web_input://TextCritical.com"), 100)

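# Hypothetical sketch of the checkpoint round-trip the test above relies on:
# save_checkpoint_data() presumably serializes the dictionary to the stanza's JSON
# checkpoint file and last_ran() reads the 'last_run' value back out. The helper names
# below are illustrative, not part of the app.
import json  # assumed available; not necessarily imported by the original test module

def _save_checkpoint_sketch(checkpoint_dir, stanza, data):
    with open(WebInput.get_file_path(checkpoint_dir, stanza), "w") as out_file:
        json.dump(data, out_file)

def _last_ran_sketch(checkpoint_dir, stanza):
    with open(WebInput.get_file_path(checkpoint_dir, stanza)) as in_file:
        return json.load(in_file).get('last_run')
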
def test_input_timeout(self):
    url_field = URLField("test_input_timeout", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(url_field.to_python("https://192.168.30.23/"),
                                  selector_field.to_python("div"), timeout=3)

    self.assertEquals(result['timed_out'], True)

def test_get_file_path(self):
    self.assertEquals(
        WebInput.get_file_path(
            "/Users/lmurphey/Applications/splunk/var/lib/splunk/modinputs/web_input",
            "web_input://TextCritical.com"),
        "/Users/lmurphey/Applications/splunk/var/lib/splunk/modinputs/web_input/2c70b6c76574eb4d825bfb194a460558.json")

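# The assertion above pins the checkpoint file name to a 32-character hex value, which
# suggests (but does not prove) a digest such as MD5 of the stanza name. A hypothetical
# sketch of that naming scheme, assuming the hash-of-stanza convention:
import hashlib

def _get_file_path_sketch(checkpoint_dir, stanza):
    return os.path.join(checkpoint_dir,
                        hashlib.md5(stanza.encode('utf-8')).hexdigest() + ".json")
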
def test_field_escaping_reserved(self):
    self.assertEqual(WebInput.escape_field_name("source"), "match_source")
    self.assertEqual(WebInput.escape_field_name("host"), "match_host")
    self.assertEqual(WebInput.escape_field_name("sourcetype"), "match_sourcetype")
    self.assertEqual(WebInput.escape_field_name("_time"), "match_time")

def test_field_escaping_whitespace(self):
    self.assertEqual(WebInput.escape_field_name(" "), "blank")

def test_field_escaping(self):
    self.assertEqual(WebInput.escape_field_name("tree()"), "tree__")

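# A rough sketch of the escaping behaviour the three tests above imply: non-alphanumeric
# characters become underscores, reserved Splunk field names get a match_ prefix, and an
# all-whitespace name collapses to "blank". This is inferred from the assertions, not
# taken from WebInput.escape_field_name() itself.
def _escape_field_name_sketch(name):
    reserved_names = {"source", "host", "sourcetype", "_time"}

    if name in reserved_names:
        return "match_" + name.lstrip("_")

    escaped = re.sub(r'[^A-Za-z0-9_]', '_', name)

    if escaped.strip("_") == "":
        return "blank"

    return escaped
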
def load_page(self, url, **kwargs):
    """
    Proxy a web-page through so that a UI can be displayed for showing potential results.
    """

    web_client = None

    try:
        # --------------------------------------
        # 1: Make sure that user has permission to make inputs. We don't want to allow people
        #    to use this as a general proxy.
        # --------------------------------------
        if not WebInputController.hasCapability('edit_modinput_web_input'):
            return self.render_error_html('You need the "edit_modinput_web_input" capability ' +
                                          'to make website inputs')

        # Don't allow proxying of the javascript files
        if url.endswith(".js"):
            cherrypy.response.headers['Content-Type'] = 'application/javascript'
            return ""

        # --------------------------------------
        # 2: Only allow HTTPS if the install is on Splunk Cloud
        # --------------------------------------
        if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')) and not url.startswith("https://"):
            return self.render_error_html('URLs on Splunk Cloud must use HTTPS protocol')

        # --------------------------------------
        # 3: Perform a request for the page
        # --------------------------------------
        # Get the proxy configuration
        conf_stanza = "default"

        try:
            web_input = WebInput(timeout=10)

            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)
        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_html("Proxy server information could not be obtained")

        # Get the timeout to use
        timeout = None

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                timeout = 15
        else:
            timeout = 15

        # Get the user-agent
        user_agent = kwargs.get('user_agent', None)

        # Get the information on the browser to use
        browser = None

        if 'browser' in kwargs:
            browser = kwargs['browser']

        # Make the client
        if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
            web_client = DefaultWebClient(timeout, user_agent, logger)
        elif browser == WebScraper.FIREFOX:
            web_client = FirefoxClient(timeout, user_agent, logger)
        elif browser == WebScraper.CHROME:
            web_client = ChromeClient(timeout, user_agent, logger)

        web_client.setProxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

        # Get the username and password
        username = kwargs.get('username', None)
        password = kwargs.get('password', None)

        username_field = kwargs.get('username_field', None)
        password_field = kwargs.get('password_field', None)
        authentication_url = kwargs.get('authentication_url', None)

        if username is not None and password is not None:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            web_client.setCredentials(username, password)

            if authentication_url is not None:
                logger.debug("Authenticating using form login in scrape_page")
                web_client.doFormLogin(authentication_url, username_field, password_field)

        # Get the page
        try:
            content = web_client.get_url(url, 'GET')
            response = web_client.get_response_headers()
        except:
            logger.exception("Exception generated while attempting to get content for url=%s", url)

            cherrypy.response.status = 500
            return self.render_error_html("Page preview could not be created using a web-browser")

        # --------------------------------------
        # 4: Render the content with the browser if necessary
        # --------------------------------------
        """
        if 'text/html' in response['content-type']:

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Try rendering the content using a web-browser
            try:
                if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                    web_scraper = WebScraper(timeout=timeout)
                    web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                    web_scraper.set_authentication(username, password)
                    content = web_scraper.get_result_browser(urlparse.urlparse(url), browser)

            except:
                logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html("Page preview could not be created using a web-browser")
        """

        # --------------------------------------
        # 5: Rewrite the links in HTML files so that they also point to the internal proxy
        # --------------------------------------
        if "<html" in content:

            # Parse the content
            html = lxml.html.document_fromstring(content)

            # Rewrite the links to point to this internal proxy
            rewrite_using_internal_proxy = True

            if rewrite_using_internal_proxy:

                def relocate_href(link):
                    """
                    Change the hrefs such that they go through the proxy.
                    """

                    link = urlparse.urljoin(url, link)

                    if link.endswith(".js"):
                        return ""
                    if not link.endswith(".css"):
                        return "load_page?url=" + link
                    else:
                        return link

                html.rewrite_links(relocate_href)

                # Block the href links
                for element, attribute, _, _ in html.iterlinks():
                    if element.tag == "a" and attribute == "href":
                        element.set('href', "#")

                    elif element.tag == "form" and attribute == "action":
                        element.set('action', "?")
            else:
                html.make_links_absolute(url)

            # Determine if we should clean the JS
            clean_script = True

            if 'clean_script' in kwargs:
                clean_script = util.normalizeBoolean(kwargs['clean_script'])

            # Determine if we should clean the CSS
            clean_styles = False

            if 'clean_styles' in kwargs:
                clean_styles = util.normalizeBoolean(kwargs['clean_styles'])

            # Clean up the HTML
            if clean_styles or clean_script:

                kill_tags = []

                if clean_script:
                    kill_tags = ["script"]

                # Remove the script blocks
                cleaner = Cleaner(page_structure=False, kill_tags=kill_tags, javascript=False,
                                  links=False, style=clean_styles, safe_attrs_only=False)

                # Get the content
                content = lxml.html.tostring(cleaner.clean_html(html))

            else:
                content = lxml.html.tostring(html)

        # --------------------------------------
        # 6: Respond with the results
        # --------------------------------------
        if 'content-type' in response:
            cherrypy.response.headers['Content-Type'] = response['content-type']
        else:
            cherrypy.response.headers['Content-Type'] = 'text/html'

        # --------------------------------------
        # 7: Clear Javascript files
        # --------------------------------------
        if response.get('content-type', "") == "application/javascript" \
                or response.get('content-type', "") == "application/x-javascript" \
                or response.get('content-type', "") == "text/javascript" \
                or url.endswith(".js"):
            return ""

        return content

    except LoginFormNotFound:
        logger.debug("Login form not found")
        return self.render_error_html("Login form was not found")

    except FormAuthenticationFailed as e:
        logger.debug("Form authentication failed: " + str(e))
        return self.render_error_html("Form authentication failed: " + str(e))

    except:
        logger.exception("Error when attempting to proxy an HTTP request")

        cherrypy.response.status = 500
        return self.render_error_html("Page preview could not be created")

    finally:
        if web_client:
            web_client.close()

def scrape_page(self, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            cherrypy.response.status = 202
            return self.render_error_json(_("No URL was provided"))

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter is different from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued parameters, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except:
                # The timeout is invalid. Ignore this for now, it will get picked up when
                # the user attempts to save the input
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse.urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_json(_("Proxy server information could not be obtained"))

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        cherrypy.response.status = 220
        return self.render_error_json(_(str(e)))

    except ServerNotFoundError as e:
        cherrypy.response.status = 220
        return self.render_error_json(_(str(e)))

    except (SelectorError, SelectorSyntaxError, ExpressionError):
        cherrypy.response.status = 220
        return self.render_error_json(_("Selector is invalid. "))

    except LoginFormNotFound:
        cherrypy.response.status = 220
        return self.render_error_json("Login form was not found")

    except FormAuthenticationFailed:
        cherrypy.response.status = 220
        return self.render_error_json("Form authentication failed")

    except Exception as e:
        cherrypy.response.status = 500
        logger.exception("Error generated during execution")
        return self.render_error_json(_(str(e)))

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0], set_mime='application/json')
    else:
        return self.render_json(result, set_mime='application/json')