Example #1
 def test_scrape_page_mv(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://textcritical.net/"), selector_field.to_python("h2"), output_matches_as_mv=True )
     self.assertEqual(result['response_code'], 200)
     self.assertEqual(len(result['match']), 3)
     
     out = StringIO()
     web_input.output_event(result, stanza="web_input://textcritical_net", index="main", source="test_web_input", sourcetype="sourcetype", out=out)
     self.assertEqual(len(re.findall("match=", out.getvalue())), 3)
Example #2
 def test_needs_another_run(self):
     
     # Test case where file does not exist
     self.assertTrue( WebInput.needs_another_run( "/Users/lmurphey/Applications/splunk/var/lib/splunk/modinputs/web_input", "web_input://DoesNotExist", 60 ) )
     
     # Test an interval right at the earlier edge
     self.assertFalse( WebInput.needs_another_run( os.path.join( self.get_test_dir(), "configs" ), "web_input://TextCritical.com", 60, 1365486765 ) )
     
     # Test an interval at the later edge
     self.assertFalse( WebInput.needs_another_run( os.path.join( self.get_test_dir(), "configs" ), "web_input://TextCritical.com", 10, 1365486775 ) )
     
     # Test interval beyond later edge
     self.assertTrue( WebInput.needs_another_run( os.path.join( self.get_test_dir(), "configs" ), "web_input://TextCritical.com", 10, 1365486776 ) )
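
A minimal sketch of the interval logic these assertions imply (an assumption, not the app's actual implementation): a run is due when no checkpoint exists for the stanza, or when strictly more than interval seconds have elapsed since the recorded last run (1365486765 in the bundled test config).

import time

def needs_another_run_sketch(last_run, interval, now=None):
    # Hypothetical helper mirroring the assertions above.
    if last_run is None:  # no checkpoint file yet
        return True
    if now is None:
        now = time.time()
    return (now - last_run) > interval  # strictly greater: the edge case is not yet due

assert needs_another_run_sketch(None, 60)
assert not needs_another_run_sketch(1365486765, 60, now=1365486765)  # earlier edge
assert not needs_another_run_sketch(1365486765, 10, now=1365486775)  # later edge
assert needs_another_run_sketch(1365486765, 10, now=1365486776)      # beyond the edge
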
Example #3
 def test_unparsable(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://textcritical.net/media/images/link_external.png"), selector_field.to_python(".hero-unit .main_background"), timeout=3, output_matches_as_mv=True )
     self.assertEqual(result['match'], [])
 def scrape_page(self, url, selector, **kwargs):
     """
     Perform a page scrape and return the results (useful for previewing a web_input modular input configuration)
     """
     
     result = {}
     
     # Run the input
     try:
         web_input = WebInput(timeout=10)
         
         # Get the authentication information, if available
         username = None
         password = None
         
         if 'password' in kwargs and 'username' in kwargs:
             username = kwargs['username']
             password = kwargs['password']
             
         # Get the user-agent string
         user_agent = None
         
         if 'user_agent' in kwargs:
             user_agent = kwargs['user_agent']
         
         # Determine if we should include empty matches
         include_empty_matches = False
         
         if 'include_empty_matches' in kwargs:
             include_empty_matches = util.normalizeBoolean(kwargs['include_empty_matches'], True)
         
         # Get the proxy configuration
         conf_stanza = "default"
         
         try:
             proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)
         except splunk.ResourceNotFound:
             cherrypy.response.status = 202
             return self.render_error_json(_("Proxy server information could not be obtained"))
         
         # Scrape the page
         result = WebInput.scrape_page( url, selector, username=username, password=password, include_empty_matches=include_empty_matches, proxy_type=proxy_type, proxy_server=proxy_server, proxy_port=proxy_port, proxy_user=proxy_user, proxy_password=proxy_password, user_agent=user_agent)
         
     except FieldValidationException as e:
         cherrypy.response.status = 202
         return self.render_error_json(_(str(e)))
Example #5
 def test_scrape_unavailable_page(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://192.168.30.23/"), selector_field.to_python(".hero-unit.main_background"), timeout=3 )
     
     self.assertEqual(result['timed_out'], True)
Example #6
 def test_scrape_page(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://textcritical.net/"), selector_field.to_python(".hero-unit.main_background") )
     self.assertEqual(result['response_code'], 200)
     self.assertEqual(len(result['match']), 1)
Example #7
 def test_scrape_page_name_attributes_separate_fields(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://127.0.0.1:8888"), selector_field.to_python(".hd"), username="******", password="******", timeout=3, name_attributes=["class"], output_matches_as_separate_fields=True, output_matches_as_mv=False)
     
     self.assertEqual(result['match_hd_1'], 'Mode:')
Example #8
 def test_scrape_page_name_attributes(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://127.0.0.1:8888"), selector_field.to_python(".hd"), username="******", password="******", timeout=3, name_attributes=["class"] )
     
     self.assertEqual(len(result['hd']), 31)
Example #9
 def test_scrape_encoding_detect_meta(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://textcritical.net/work/new-testament/Mark/1/2"), selector_field.to_python(".verse-container"), charset_detect_meta_enabled=True, charset_detect_content_type_header_enabled=False, charset_detect_sniff_enabled=False )
     self.assertEqual(result['response_code'], 200)
     self.assertEqual(result['encoding'], "utf-8")
    def test_browser(self, browser, **kwargs):
        """
        Determine if the given browser is configured and able to be used.
        """

        success = None

        web_scraper = WebScraper(3)

        # Set the proxy authentication
        try:
            web_input = WebInput(timeout=10)
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = web_input.get_proxy_config(
                cherrypy.session.get('sessionKey'), "default")

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port,
                                  proxy_user, proxy_password)

        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_json(
                _("Proxy server information could not be obtained"))

        try:
            result = web_scraper.scrape_page(
                selector="a",
                url=WebInputController.TEST_BROWSER_URL,
                browser=browser,
                include_raw_content=True)

            if not result:
                success = False
            elif len(result) < 1:
                success = False
            elif 'browser' not in result[0]:
                success = True
            else:
                success = (result[0]['browser'] == browser)

        except Exception as exception:
            logger.exception(
                "Exception generated when attempting to test the browser")
            success = False

        return self.render_json({'success': success})
Example #11
 def test_scrape_page_with_invalid_credentials(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://127.0.0.1:8888"), selector_field.to_python("tr"), timeout=3, output_matches_as_mv=True )
     
     #print result['match']
     self.assertEqual(len(result['match']), 0)
Example #12
 def test_scrape_page_name_attributes_escaped_name(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://127.0.0.1:8888"), selector_field.to_python("input"), username="******", password="******", timeout=3, name_attributes=["onclick"], include_empty_matches=True)
     
     self.assertTrue('btnBerTest__' in result)
     self.assertTrue('btnReset__' in result)
Example #13
 def test_scrape_page_adjacent_selector(self):
     # For bug: http://lukemurphey.net/issues/773
     
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://textcritical.net/"), selector_field.to_python("h1+p,.sharing-buttons"), timeout=3, output_matches_as_mv=True )
     self.assertEqual(len(result['match']), 2)
    def get_login_fields(self, url=None, **kwargs):

        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
        web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

        client = MechanizeClient(5)

        logger.debug("Using proxy %s to detect form fields", proxy_server)

        user_agent = kwargs.get('user_agent')

        _, username_field, password_field = client.detectFormFields(url, proxy_type, proxy_server, proxy_port, proxy_user, proxy_password, user_agent)

        return self.render_json({
            'username_field' : username_field or "",
            'password_field' : password_field or ""
        })
Example #15
 def test_scrape_encoding_detect_page(self):
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://textcritical.net/work/new-testament/Mark/1/2?async"), selector_field.to_python(".verse-container") )
     self.assertEqual(result['response_code'], 200)
     self.assertEqual(len(result['match']), 45)
     #print result['match']
     self.assertEqual(unicodedata.normalize('NFC', result['match'][1]), unicodedata.normalize('NFC', u"2 Καθὼς γέγραπται ἐν τῷ Ἠσαίᾳ τῷ προφήτῃ Ἰδοὺ ἀποστέλλω τὸν ἄγγελόν μου πρὸ προσώπου σου , ὃς κατασκευάσει τὴν ὁδόν σου :"))
     self.assertEqual(result['encoding'], "utf-8")
Example #16
 def test_scrape_page_child_text(self):
     # This test ensures that text from nodes under the selected nodes is properly extracted
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://textcritical.net/"), selector_field.to_python(".hero-unit.main_background"), output_matches_as_mv=True )
     self.assertEqual(result['response_code'], 200)
     self.assertEqual(len(result['match']), 1)
     
     self.assertEqual(result['match'][0], "Ancient Greek, Modern Design TextCritical.net is a website that provides a library of ancient Greek works")
Example #17
 def handle_results(self, results, in_preview, session_key):
     
     # FYI: we ignore results since this is a generating command
     
     # Do the scraping
     result = WebInput.scrape_page(self.url, self.selector, self.username, self.password, self.timeout, self.name_attributes, self.output_matches_as_mv, self.output_matches_as_separate_fields, include_empty_matches=False, proxy_type="http", proxy_server=None, proxy_port=None, proxy_user=None, proxy_password=None)
     
     self.logger.debug("Retrieved results, result=%r", result)
     
     # Output the results
     self.output_results([result])
Example #18
 def test_scrape_page_bad_encoding(self):
     #http://lukemurphey.net/issues/987
     
     web_input = WebInput(timeout=3)
     
     url_field = URLField( "test_web_input", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("http://rss.slashdot.org/Slashdot/slashdot"), selector_field.to_python("description") )
     self.assertEqual(result['response_code'], 200)
     self.assertGreater(len(result['match']), 0)
     self.assertEqual(result['encoding'], "ISO-8859-1")
    def get_get_login_fields(self, request_info, url=None, **kwargs):

        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
        web_input.get_proxy_config(request_info.session_key, "default")

        client = MechanizeClient(5)

        logger.debug("Using proxy %s to detect form fields", proxy_server)

        user_agent = kwargs.get('user_agent')

        _, username_field, password_field = client.detectFormFields(
            url, proxy_type, proxy_server, proxy_port, proxy_user,
            proxy_password, user_agent)

        return self.render_json({
            'username_field': username_field or "",
            'password_field': password_field or ""
        })
    def test_browser(self, browser, **kwargs):
        """
        Determine if the given browser is configured and able to be used.
        """

        success = None

        web_scraper = WebScraper(3)

        # Set the proxy authentication
        try:
            web_input = WebInput(timeout=10)
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_json(_("Proxy server information could not be obtained"))

        try:
            result = web_scraper.scrape_page(selector="a", url=WebInputController.TEST_BROWSER_URL,
                                             browser=browser, include_raw_content=True)

            if not result:
                success = False
            elif len(result) < 1:
                success = False
            else:
                success = (result[0]['browser'] == browser)
        
        except Exception as exception:
            logger.exception("Exception generated when attempting to test the browser")
            success = False

        return self.render_json({
            'success' : success
        })
    def get_scrape_page(self, request_info, **kwargs):
        """
        Perform a page scrape and return the results (useful for previewing a web_input modular
        input configuration)
        """

        result = [{}]

        # Run the input
        try:
            web_input = WebInput(timeout=10)

            kw = {}

            # Get the URL or URI
            url = None

            if 'url' in kwargs:
                url = kwargs['url']
            elif 'uri' in kwargs:
                url = kwargs['uri']

            if url is None:
                return self.render_error_json("No URL was provided", 202)

            # Get the selector
            selector = None

            if 'selector' in kwargs:
                selector = kwargs['selector']

            # Determine if we should include empty matches
            if 'empty_matches' in kwargs:
                kw['include_empty_matches'] = util.normalizeBoolean(
                    kwargs['empty_matches'], True)

            # Get the use_element_name parameter
            if 'use_element_name' in kwargs:
                kw['use_element_name'] = util.normalizeBoolean(
                    kwargs['use_element_name'], False)

            # Get the text_separator parameter
            if 'text_separator' in kwargs:
                kw['text_separator'] = kwargs['text_separator']

            # Get the output_as_mv parameter. This parameter's name differs from the one the
            # class accepts and will be renamed accordingly.
            if 'output_as_mv' in kwargs:
                kw['output_matches_as_mv'] = util.normalizeBoolean(
                    kwargs['output_as_mv'], True)

                # If we are outputting as multi-valued parameters, then don't include the separate
                # fields
                if kw['output_matches_as_mv']:
                    kw['output_matches_as_separate_fields'] = False
                else:
                    # http://lukemurphey.net/issues/1643
                    kw['output_matches_as_separate_fields'] = True

            # Get the field match prefix
            if 'match_prefix' in kwargs:
                kw['match_prefix'] = kwargs['match_prefix']

            # Get the browser parameter
            if 'browser' in kwargs:
                kw['browser'] = kwargs['browser']

            # Get the page_limit parameter
            if 'page_limit' in kwargs:
                kw['page_limit'] = int(kwargs['page_limit'])

            # Get the depth_limit parameter
            if 'depth_limit' in kwargs:
                kw['depth_limit'] = int(kwargs['depth_limit'])

            # Get the url_filter parameter
            if 'url_filter' in kwargs:
                kw['url_filter'] = kwargs['url_filter']

            # Get the name_attributes parameter
            if 'name_attributes' in kwargs:
                kw['name_attributes'] = kwargs['name_attributes']

            # Get the raw_content parameter
            if 'raw_content' in kwargs:
                kw['include_raw_content'] = util.normalizeBoolean(
                    kwargs['raw_content'])

            # Only extract links using HTTPS if on Splunk Cloud
            if ModularInput.is_on_cloud(request_info.session_key):
                kw['https_only'] = True

            # Otherwise, allow callers to specify which links to extract
            elif 'https_only' in kwargs:
                kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

            # Get the proxy configuration
            conf_stanza = "default"

            # Get the timeout parameter
            timeout = 5

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except:
                    # The timeout is invalid. Ignore this for now, it will get picked up when
                    # the user attempts to save the input
                    pass

            # Make the web scraper instance
            web_scraper = WebScraper(timeout)

            # Get the authentication information, if available
            username = None
            password = None

            if 'password' in kwargs and 'username' in kwargs:
                username = kwargs['username']
                password = kwargs['password']

                username_field = kwargs.get('username_field', None)
                password_field = kwargs.get('password_field', None)
                authentication_url = kwargs.get('authentication_url', None)

                if authentication_url is not None:
                    authentication_url = urlparse(authentication_url)

                logger.debug("Using credentials for scrape_page")
                web_scraper.set_authentication(username, password,
                                               authentication_url,
                                               username_field, password_field)

            # Get the user-agent string
            if 'user_agent' in kwargs:
                web_scraper.user_agent = kwargs['user_agent']

            # Set the proxy authentication
            try:
                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = web_input.get_proxy_config(
                    request_info.session_key, conf_stanza)

                web_scraper.set_proxy(proxy_type, proxy_server, proxy_port,
                                      proxy_user, proxy_password)

            except ResourceNotFound:
                return self.render_error_json(
                    "Proxy server information could not be obtained", 202)

            # Scrape the page
            result = web_scraper.scrape_page(url, selector, **kw)

        except FieldValidationException as e:
            return self.render_error_json(str(e), 220)

        except ServerNotFoundError as e:
            return self.render_error_json(str(e), 220)

        except (SelectorError, SelectorSyntaxError, ExpressionError):
            return self.render_error_json("Selector is invalid. ", 220)

        except LoginFormNotFound:
            return self.render_error_json("Login form was not found", 220)

        except FormAuthenticationFailed:
            return self.render_error_json("Form authentication failed", 220)

        except Exception as e:
            logger.exception("Error generated during execution")
            return self.render_error_json(str(e), 500)

        # Return the information
        if 'include_first_result_only' in kwargs:
            return self.render_json(result[0])
        else:
            return self.render_json(result)
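
For reference, a hypothetical preview request against this handler could look like the sketch below. The REST endpoint path and credentials are assumptions, while the parameter names (url, selector, output_as_mv, empty_matches, timeout) are the ones the handler reads from kwargs.

import requests

params = {
    'url': 'http://textcritical.net/',
    'selector': 'h2',
    'output_as_mv': '1',
    'empty_matches': '0',
    'timeout': '5',
}

# The exact path depends on how the REST handler is registered; this one is illustrative only.
response = requests.get(
    'https://localhost:8089/services/web_input/scrape_page',
    params=params,
    auth=('admin', 'changeme'),  # placeholder credentials
    verify=False,
)

print(response.json())
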
    def get_load_page(self, request_info, url, **kwargs):
        """
        Proxy a web-page through so that a UI can be displayed for showing potential results.
        """

        web_client = None

        try:

            # --------------------------------------
            # 1: Make sure that user has permission to make inputs. We don't want to allow people
            #    to use this as a general proxy.
            # --------------------------------------
            if (not WebInputOperationsHandler.hasCapability('edit_modinput_web_input')
                    and not WebInputOperationsHandler.hasCapability('admin_all_objects')):
                return self.render_error_html(
                    'You need the "edit_modinput_web_input" capability ' +
                    'to make website inputs', 403)

            # Don't allow proxying of the javascript files
            if url.endswith(".js"):
                return {
                    'payload': '',
                    'status': 200,
                    'headers': {
                        'Content-Type': 'application/javascript'
                    },
                }

            # --------------------------------------
            # 2: Only allow HTTPS if the install is on Splunk Cloud
            # --------------------------------------
            if ModularInput.is_on_cloud(request_info.session_key
                                        ) and not url.startswith("https://"):
                return self.render_error_html(
                    'URLs on Splunk Cloud must use HTTPS protocol',
                    401)  # TODO: determine best code

            # --------------------------------------
            # 3: Perform a request for the page
            # --------------------------------------

            # Get the proxy configuration
            conf_stanza = "default"

            try:
                web_input = WebInput(timeout=10)

                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            except ResourceNotFound:
                return self.render_error_html(
                    "Proxy server information could not be obtained", 202)

            # Get the timeout to use
            timeout = None

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except ValueError:
                    timeout = 15
            else:
                timeout = 15

            # Get the user-agent
            user_agent = kwargs.get('user_agent', None)

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Make the client
            if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
                web_client = DefaultWebClient(timeout, user_agent, logger)
            elif browser == WebScraper.FIREFOX:
                web_client = FirefoxClient(timeout, user_agent, logger)
            elif browser == WebScraper.CHROME:
                web_client = ChromeClient(timeout, user_agent, logger)

            web_client.setProxy(proxy_type, proxy_server, proxy_port,
                                proxy_user, proxy_password)

            # Get the username and password
            username = kwargs.get('username', None)
            password = kwargs.get('password', None)

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if username is not None and password is not None:
                username = kwargs['username']
                password = kwargs['password']

                username_field = kwargs.get('username_field', None)
                password_field = kwargs.get('password_field', None)
                authentication_url = kwargs.get('authentication_url', None)

                web_client.setCredentials(username, password)

                if authentication_url is not None:
                    logger.debug(
                        "Authenticating using form login in scrape_page")
                    web_client.doFormLogin(authentication_url, username_field,
                                           password_field)

            # Get the page
            try:
                content = web_client.get_url(url, 'GET')
                response = web_client.get_response_headers()
            except:
                logger.exception(
                    "Exception generated while attempting to content for url=%s",
                    url)
                return self.render_error_html(
                    "Page preview could not be obtained using a web-browser",
                    500)

            # --------------------------------------
            # 4: Render the content with the browser if necessary
            # --------------------------------------
            """
            if 'text/html' in response['content-type']:

                # Get the information on the browser to use
                browser = None

                if 'browser' in kwargs:
                    browser = kwargs['browser']

                # Try rendering the content using a web-browser
                try:
                    if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                        
                        web_scraper = WebScraper(timeout=timeout)
                        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                        web_scraper.set_authentication(username, password)
                        content = web_scraper.get_result_browser(urlparse(url), browser)

                except:
                    logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                    cherrypy.response.status = 500
                    return self.render_error_html("Page preview could not be obtained using a web-browser")
            """

            # --------------------------------------
            # 5: Rewrite the links in HTML files so that they also point to the internal proxy
            # --------------------------------------
            if "<html" in content:

                # Parse the content
                html = lxml.html.document_fromstring(content)

                # Rewrite the links to point to this internal proxy
                rewrite_using_internal_proxy = True

                if rewrite_using_internal_proxy:

                    def relocate_href(link):
                        """
                        Change the hrefs such that they go through the proxy.
                        """

                        link = urljoin(url, link)

                        if link.endswith(".js"):
                            return ""
                        if not link.endswith(".css"):
                            return "load_page?url=" + link
                        else:
                            return link

                    html.rewrite_links(relocate_href)

                    # Block the href links
                    for element, attribute, _, _ in html.iterlinks():
                        if element.tag == "a" and attribute == "href":
                            element.set('href', "#")

                        elif element.tag == "form" and attribute == "action":
                            element.set('action', "?")
                else:
                    html.make_links_absolute(url)

                # Determine if we should clean the JS
                clean_script = True

                if 'clean_script' in kwargs:
                    clean_script = util.normalizeBoolean(
                        kwargs['clean_script'])

                # Determine if we should clean the CSS
                clean_styles = False

                if 'clean_styles' in kwargs:
                    clean_styles = util.normalizeBoolean(
                        kwargs['clean_styles'])

                # Clean up the HTML
                if clean_styles or clean_script:

                    kill_tags = []

                    if clean_script:
                        kill_tags = ["script"]

                    # Remove the script blocks
                    cleaner = Cleaner(page_structure=False,
                                      kill_tags=kill_tags,
                                      javascript=False,
                                      links=False,
                                      style=clean_styles,
                                      safe_attrs_only=False)

                    # Get the content
                    content = lxml.html.tostring(cleaner.clean_html(html),
                                                 encoding="unicode")

                else:
                    content = lxml.html.tostring(html, encoding="unicode")

            # --------------------------------------
            # 6: Respond with the results
            # --------------------------------------
            headers = {}

            if 'content-type' in response:
                headers['Content-Type'] = response['content-type']
            else:
                headers['Content-Type'] = 'text/html'

            # --------------------------------------
            # 7: Clear Javascript files
            # --------------------------------------
            if response.get('content-type', "") == "application/javascript" \
               or response.get('content-type', "") == "application/x-javascript" \
               or response.get('content-type', "") == "text/javascript" \
               or url.endswith(".js"):

                return {'payload': '', 'headers': headers, 'status': 200}

            return {'payload': content, 'headers': headers, 'status': 200}

        except LoginFormNotFound:
            logger.debug("Login form not found")
            return self.render_error_html("Login form was not found", 200)

        except FormAuthenticationFailed as e:
            logger.debug("Form authentication failed: " + str(e))
            return self.render_error_html(
                "Form authentication failed: " + str(e), 200)

        except:
            logger.exception("Error when attempting to proxy an HTTP request")
            return self.render_error_html("Page preview could not be created",
                                          500)

        finally:
            if web_client:
                web_client.close()
Example #23
 def test_is_expired(self):
     self.assertFalse( WebInput.is_expired(time.time(), 30) )
     self.assertTrue( WebInput.is_expired(time.time() - 31, 30) )
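
The expiry check exercised here can be written directly from the assertions; a minimal sketch (an assumption, not the app's code): an entry is expired once more than the allowed number of seconds has elapsed.

import time

def is_expired_sketch(last_run, expiration_seconds):
    # Hypothetical helper: expired once more than expiration_seconds have elapsed.
    return (time.time() - last_run) > expiration_seconds

assert not is_expired_sketch(time.time(), 30)
assert is_expired_sketch(time.time() - 31, 30)
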
Example #24
 def test_save_checkpoint(self):
     WebInput.save_checkpoint_data(self.tmp_dir, "web_input://TextCritical.com", { 'last_run': 100 })
     self.assertEqual(WebInput.last_ran(self.tmp_dir, "web_input://TextCritical.com"), 100)
Example #25
 def test_input_timeout(self):
     url_field = URLField( "test_input_timeout", "title", "this is a test" )
     selector_field = SelectorField( "test_web_input_css", "title", "this is a test" )
     result = WebInput.scrape_page( url_field.to_python("https://192.168.30.23/"), selector_field.to_python("div"), timeout=3 )
     
     self.assertEqual(result['timed_out'], True)
Example #26
 def test_get_file_path(self):
     self.assertEqual(WebInput.get_file_path("/Users/lmurphey/Applications/splunk/var/lib/splunk/modinputs/web_input", "web_input://TextCritical.com"), "/Users/lmurphey/Applications/splunk/var/lib/splunk/modinputs/web_input/2c70b6c76574eb4d825bfb194a460558.json")
Example #27
 def test_field_escaping_reserved(self):
     self.assertEqual(WebInput.escape_field_name("source"), "match_source")
     self.assertEqual(WebInput.escape_field_name("host"), "match_host")
     self.assertEqual(WebInput.escape_field_name("sourcetype"), "match_sourcetype")
     self.assertEqual(WebInput.escape_field_name("_time"), "match_time")
Example #28
 def test_field_escaping_whitespace(self):
     self.assertEqual(WebInput.escape_field_name("  "), "blank")
Example #29
 def test_field_escaping(self):
     self.assertEqual(WebInput.escape_field_name("tree()"), "tree__")
    def load_page(self, url, **kwargs):
        """
        Proxy a web-page through so that a UI can be displayed for showing potential results.
        """

        web_client = None

        try:

            # --------------------------------------
            # 1: Make sure that user has permission to make inputs. We don't want to allow people
            #    to use this as a general proxy.
            # --------------------------------------
            if not WebInputController.hasCapability('edit_modinput_web_input'):
                return self.render_error_html('You need the "edit_modinput_web_input" capability ' +
                                              'to make website inputs')

            # Don't allow proxying of the javascript files
            if url.endswith(".js"):
                cherrypy.response.headers['Content-Type'] = 'application/javascript'
                return ""

            # --------------------------------------
            # 2: Only allow HTTPS if the install is on Splunk Cloud
            # --------------------------------------
            if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')) and not url.startswith("https://"):
                return self.render_error_html('URLs on Splunk Cloud must use HTTPS protocol')

            # --------------------------------------
            # 3: Perform a request for the page
            # --------------------------------------

            # Get the proxy configuration
            conf_stanza = "default"

            try:
                web_input = WebInput(timeout=10)

                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)

            except splunk.ResourceNotFound:
                cherrypy.response.status = 202
                return self.render_error_html("Proxy server information could not be obtained")

            # Get the timeout to use
            timeout = None

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except ValueError:
                    timeout = 15
            else:
                timeout = 15

            # Get the user-agent
            user_agent = kwargs.get('user_agent', None)

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Make the client
            if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
                web_client = DefaultWebClient(timeout, user_agent, logger)
            elif browser == WebScraper.FIREFOX:
                web_client = FirefoxClient(timeout, user_agent, logger)
            elif browser == WebScraper.CHROME:
                web_client = ChromeClient(timeout, user_agent, logger)
            
            web_client.setProxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

            # Get the username and password
            username = kwargs.get('username', None)
            password = kwargs.get('password', None)

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if username is not None and password is not None:
                username = kwargs['username']
                password = kwargs['password']

                username_field = kwargs.get('username_field', None)
                password_field = kwargs.get('password_field', None)
                authentication_url = kwargs.get('authentication_url', None)

                web_client.setCredentials(username, password)

                if authentication_url is not None:
                    logger.debug("Authenticating using form login in scrape_page")
                    web_client.doFormLogin(authentication_url, username_field, password_field)

            # Get the page
            try:
                content = web_client.get_url(url, 'GET')
                response = web_client.get_response_headers()
            except:
                logger.exception("Exception generated while attempting to content for url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html("Page preview could not be created using a web-browser")

            # --------------------------------------
            # 4: Render the content with the browser if necessary
            # --------------------------------------
            """
            if 'text/html' in response['content-type']:

                # Get the information on the browser to use
                browser = None

                if 'browser' in kwargs:
                    browser = kwargs['browser']

                # Try rendering the content using a web-browser
                try:
                    if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                        
                        web_scraper = WebScraper(timeout=timeout)
                        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                        web_scraper.set_authentication(username, password)
                        content = web_scraper.get_result_browser(urlparse.urlparse(url), browser)

                except:
                    logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                    cherrypy.response.status = 500
                    return self.render_error_html("Page preview could not be created using a web-browser")
            """

            # --------------------------------------
            # 5: Rewrite the links in HTML files so that they also point to the internal proxy
            # --------------------------------------
            if "<html" in content:

                # Parse the content
                html = lxml.html.document_fromstring(content)

                # Rewrite the links to point to this internal proxy
                rewrite_using_internal_proxy = True

                if rewrite_using_internal_proxy:

                    def relocate_href(link):
                        """
                        Change the hrefs such that they go through the proxy.
                        """

                        link = urlparse.urljoin(url, link)

                        if link.endswith(".js"):
                            return ""
                        if not link.endswith(".css"):
                            return "load_page?url=" + link
                        else:
                            return link

                    html.rewrite_links(relocate_href)

                    # Block the href links
                    for element, attribute, _, _ in html.iterlinks():
                        if element.tag == "a" and attribute == "href":
                            element.set('href', "#")

                        elif element.tag == "form" and attribute == "action":
                            element.set('action', "?")
                else:
                    html.make_links_absolute(url)

                # Determine if we should clean the JS
                clean_script = True

                if 'clean_script' in kwargs:
                    clean_script = util.normalizeBoolean(kwargs['clean_script'])

                # Determine if we should clean the CSS
                clean_styles = False

                if 'clean_styles' in kwargs:
                    clean_styles = util.normalizeBoolean(kwargs['clean_styles'])

                # Clean up the HTML
                if clean_styles or clean_script:

                    kill_tags = []

                    if clean_script:
                        kill_tags = ["script"]

                    # Remove the script blocks
                    cleaner = Cleaner(page_structure=False, kill_tags=kill_tags, javascript=False,
                                      links=False, style=clean_styles, safe_attrs_only=False)

                    # Get the content
                    content = lxml.html.tostring(cleaner.clean_html(html))

                else:
                    content = lxml.html.tostring(html)

            # --------------------------------------
            # 6: Respond with the results
            # --------------------------------------
            if 'content-type' in response:
                cherrypy.response.headers['Content-Type'] = response['content-type']
            else:
                cherrypy.response.headers['Content-Type'] = 'text/html'

            # --------------------------------------
            # 7: Clear Javascript files
            # --------------------------------------
            if response.get('content-type', "") == "application/javascript" \
               or response.get('content-type', "") == "application/x-javascript" \
               or response.get('content-type', "") == "text/javascript" \
               or url.endswith(".js"):

                return ""

            return content

        except LoginFormNotFound:
            logger.debug("Login form not found")
            return self.render_error_html("Login form was not found")

        except FormAuthenticationFailed as e:
            logger.debug("Form authentication failed: " + str(e))
            return self.render_error_html("Form authentication failed: " + str(e))

        except:
            logger.exception("Error when attempting to proxy an HTTP request")
            cherrypy.response.status = 500
            return self.render_error_html("Page preview could not be created")

        finally:
            if web_client:
                web_client.close()
    def scrape_page(self, **kwargs):
        """
        Perform a page scrape and return the results (useful for previewing a web_input modular
        input configuration)
        """

        result = [{}]

        # Run the input
        try:
            web_input = WebInput(timeout=10)

            kw = {}

            # Get the URL or URI
            url = None

            if 'url' in kwargs:
                url = kwargs['url']
            elif 'uri' in kwargs:
                url = kwargs['uri']

            if url is None:
                cherrypy.response.status = 202
                return self.render_error_json(_("No URL was provided"))

            # Get the selector
            selector = None

            if 'selector' in kwargs:
                selector = kwargs['selector']

            # Determine if we should include empty matches
            if 'empty_matches' in kwargs:
                kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

            # Get the use_element_name parameter
            if 'use_element_name' in kwargs:
                kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

            # Get the text_separator parameter
            if 'text_separator' in kwargs:
                kw['text_separator'] = kwargs['text_separator']

            # Get the output_as_mv parameter. This parameter's name differs from the one the
            # class accepts and will be renamed accordingly.
            if 'output_as_mv' in kwargs:
                kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

                # If we are outputting as multi-valued parameters, then don't include the separate
                # fields
                if kw['output_matches_as_mv']:
                    kw['output_matches_as_separate_fields'] = False
                else:
                    # http://lukemurphey.net/issues/1643
                    kw['output_matches_as_separate_fields'] = True

            # Get the field match prefix
            if 'match_prefix' in kwargs:
                kw['match_prefix'] = kwargs['match_prefix']

            # Get the browser parameter
            if 'browser' in kwargs:
                kw['browser'] = kwargs['browser']

            # Get the page_limit parameter
            if 'page_limit' in kwargs:
                kw['page_limit'] = int(kwargs['page_limit'])

            # Get the depth_limit parameter
            if 'depth_limit' in kwargs:
                kw['depth_limit'] = int(kwargs['depth_limit'])

            # Get the url_filter parameter
            if 'url_filter' in kwargs:
                kw['url_filter'] = kwargs['url_filter']

            # Get the name_attributes parameter
            if 'name_attributes' in kwargs:
                kw['name_attributes'] = kwargs['name_attributes']

            # Get the raw_content parameter
            if 'raw_content' in kwargs:
                kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

            # Only extract links using HTTPS if on Splunk Cloud
            if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')):
                kw['https_only'] = True

            # Otherwise, allow callers to specify which links to extract
            elif 'https_only' in kwargs:
                kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

            # Get the proxy configuration
            conf_stanza = "default"

            # Get the timeout parameter
            timeout = 5

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except:
                    # The timeout is invalid. Ignore this for now, it will get picked up when
                    # the user attempts to save the input
                    pass

            # Make the web scraper instance
            web_scraper = WebScraper(timeout)

            # Get the authentication information, if available
            username = None
            password = None

            if 'password' in kwargs and 'username' in kwargs:
                username = kwargs['username']
                password = kwargs['password']

                username_field = kwargs.get('username_field', None)
                password_field = kwargs.get('password_field', None)
                authentication_url = kwargs.get('authentication_url', None)

                if authentication_url is not None:
                    authentication_url = urlparse.urlparse(authentication_url)

                logger.debug("Using credentials for scrape_page")
                web_scraper.set_authentication(username, password, authentication_url, username_field, password_field)

            # Get the user-agent string
            if 'user_agent' in kwargs:
                web_scraper.user_agent = kwargs['user_agent']

            # Set the proxy authentication
            try:
                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)

                web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

            except splunk.ResourceNotFound:
                cherrypy.response.status = 202
                return self.render_error_json(_("Proxy server information could not be obtained"))

            # Scrape the page
            result = web_scraper.scrape_page(url, selector, **kw)

        except FieldValidationException as e:
            cherrypy.response.status = 220
            return self.render_error_json(_(str(e)))

        except ServerNotFoundError as e:
            cherrypy.response.status = 220
            return self.render_error_json(_(str(e)))

        except (SelectorError, SelectorSyntaxError, ExpressionError):
            cherrypy.response.status = 220
            return self.render_error_json(_("Selector is invalid. "))

        except LoginFormNotFound:
            cherrypy.response.status = 220
            return self.render_error_json("Login form was not found")

        except FormAuthenticationFailed:
            cherrypy.response.status = 220
            return self.render_error_json("Form authentication failed")

        except Exception as e:
            cherrypy.response.status = 500

            logger.exception("Error generated during execution")
            return self.render_error_json(_(str(e)))

        # Return the information
        if 'include_first_result_only' in kwargs:
            return self.render_json(result[0], set_mime='application/json')
        else:
            return self.render_json(result, set_mime='application/json')