def test_browser(self, browser, **kwargs):
    """
    Determine if the given browser is configured and able to be used.
    """

    success = None

    web_scraper = WebScraper(3)

    # Set the proxy authentication
    try:
        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
            web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
    except splunk.ResourceNotFound:
        cherrypy.response.status = 202
        return self.render_error_json(_("Proxy server information could not be obtained"))

    try:
        result = web_scraper.scrape_page(selector="a",
                                         url=WebInputController.TEST_BROWSER_URL,
                                         browser=browser,
                                         include_raw_content=True)

        if not result:
            success = False
        elif len(result) < 1:
            success = False
        elif 'browser' not in result[0]:
            success = True
        else:
            success = (result[0]['browser'] == browser)
    except Exception:
        logger.exception("Exception generated when attempting to test the browser")
        success = False

    return self.render_json({'success': success})
def test_browser(self, browser, **kwargs):
    """
    Determine if the given browser is configured and able to be used.
    """

    success = None

    web_scraper = WebScraper(3)

    # Set the proxy authentication
    try:
        web_input = WebInput(timeout=10)

        proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
            web_input.get_proxy_config(cherrypy.session.get('sessionKey'), "default")

        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
    except splunk.ResourceNotFound:
        cherrypy.response.status = 202
        return self.render_error_json(_("Proxy server information could not be obtained"))

    try:
        result = web_scraper.scrape_page(selector="a",
                                         url=WebInputController.TEST_BROWSER_URL,
                                         browser=browser,
                                         include_raw_content=True)

        if not result:
            success = False
        elif len(result) < 1:
            success = False
        else:
            success = (result[0]['browser'] == browser)
    except Exception:
        logger.exception("Exception generated when attempting to test the browser")
        success = False

    return self.render_json({'success': success})
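# Hedged sketch (not part of the original source): the success-determination logic from
# test_browser above, factored into a standalone helper so it can be exercised without a
# live WebScraper. The sample inputs assume scrape_page returns a list of match
# dictionaries with an optional 'browser' field, as the handler above implies.
def _browser_result_indicates_success(result, browser):
    if not result or len(result) < 1:
        return False
    if 'browser' not in result[0]:
        return True
    return result[0]['browser'] == browser

assert _browser_result_indicates_success([{'browser': 'firefox'}], 'firefox')
assert not _browser_result_indicates_success([], 'firefox')
assert not _browser_result_indicates_success([{'browser': 'chrome'}], 'firefox')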
def get_scrape_page(self, request_info, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            return self.render_error_json("No URL was provided", 202)

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter is different from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued parameters, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(request_info.session_key):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except (ValueError, TypeError):
                # The timeout is invalid. Ignore this for now, it will get picked up when
                # the user attempts to save the input
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
        except ResourceNotFound:
            return self.render_error_json("Proxy server information could not be obtained", 202)

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        return self.render_error_json(str(e), 220)
    except ServerNotFoundError as e:
        return self.render_error_json(str(e), 220)
    except (SelectorError, SelectorSyntaxError, ExpressionError):
        return self.render_error_json("Selector is invalid.", 220)
    except LoginFormNotFound:
        return self.render_error_json("Login form was not found", 220)
    except FormAuthenticationFailed:
        return self.render_error_json("Form authentication failed", 220)
    except Exception as e:
        logger.exception("Error generated during execution")
        return self.render_error_json(str(e), 500)

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0])
    else:
        return self.render_json(result)
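# Hedged sketch (not from the source): the renaming that get_scrape_page performs for the
# output_as_mv request parameter, shown in isolation. A local stand-in is used in place of
# Splunk's util.normalizeBoolean so the snippet runs on its own; the real helper also
# handles values such as "t", "f", 1, and 0.
def _to_bool(value, default=True):
    if value is None:
        return default
    return str(value).strip().lower() not in ('0', 'false', 'f', 'no', 'n')

def translate_output_as_mv(request_args):
    kw = {}
    if 'output_as_mv' in request_args:
        kw['output_matches_as_mv'] = _to_bool(request_args['output_as_mv'], True)
        # Multi-valued output and separate per-match fields are mutually exclusive
        # (see http://lukemurphey.net/issues/1643)
        kw['output_matches_as_separate_fields'] = not kw['output_matches_as_mv']
    return kw

print(translate_output_as_mv({'output_as_mv': 'false'}))
# -> {'output_matches_as_mv': False, 'output_matches_as_separate_fields': True}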
class WebScraperSearchCommand(SearchCommand):
    """
    The search command takes the arguments provided on the command-line and sends them to
    the modular input functions so that you can run the input manually.
    """

    def __init__(self, url=None, selector=None, username=None, password=None, timeout=30,
                 name_attributes=None, output_as_mv=True, output_matches_as_mv=None,
                 output_matches_as_separate_fields=False, use_element_name=False,
                 page_limit=1, depth_limit=50, url_filter=None, text_separator=" ",
                 raw_content=False, include_raw_content=None, browser=None,
                 match_prefix=None, user_agent=None, empty_matches=False,
                 empty_value='NULL', authentication_url=None, username_field=None,
                 password_field=None):

        # Note: output_matches_as_mv and include_raw_content are supported for legacy purposes

        # Make sure the required arguments are provided
        if url is None:
            raise ValueError("url argument must be provided")

        if selector is None:
            raise ValueError("selector argument must be provided")

        # Use the older output_matches_as_mv field if included
        if output_matches_as_mv is not None:
            output_as_mv = output_matches_as_mv

        # Decide on whether to include the matches as separate fields if output_as_mv is set
        if normalizeBoolean(output_as_mv):
            output_as_mv = True
            output_matches_as_separate_fields = False
        else:
            output_as_mv = False
            output_matches_as_separate_fields = True

        if name_attributes is None:
            name_attributes = []

        # Make the web scraper instance
        self.web_scraper = WebScraper(int(timeout))
        self.web_scraper.user_agent = user_agent

        # Save the parameters
        self.params = {
            "url": url,
            "selector": selector,
            "name_attributes": name_attributes,
            "output_matches_as_mv": normalizeBoolean(output_as_mv),
            "output_matches_as_separate_fields": normalizeBoolean(output_matches_as_separate_fields),
            "include_empty_matches": empty_matches,
            "empty_value": empty_value,
            "use_element_name": normalizeBoolean(use_element_name),
            "page_limit": int(page_limit),
            "depth_limit": int(depth_limit),
            "url_filter": url_filter,
            "include_raw_content": normalizeBoolean(include_raw_content) if include_raw_content is not None else normalizeBoolean(raw_content),
            "text_separator": text_separator,
            "browser": browser,
            "match_prefix": match_prefix
        }

        if username is not None and password is not None:
            self.web_scraper.set_authentication(username, password, authentication_url,
                                                username_field, password_field)

        SearchCommand.__init__(self, run_in_preview=True, logger_name="web_scrape")

        self.logger.info("Web scraper running against url=%s", url)

    def handle_results(self, results, session_key, in_preview):
        # FYI: we ignore results since this is a generating command

        # Make sure that the URL is using SSL if on Splunk Cloud
        if ModularInput.is_on_cloud(session_key) and not self.params["url"].startswith("https"):
            raise Exception("The URL to scrape must use HTTPS; Splunk Cloud doesn't allow unsecured network access")

        # Make sure that links only get extracted if they point to HTTPS sites when on Splunk Cloud
        self.params['https_only'] = ModularInput.is_on_cloud(session_key)

        # Do the scraping
        results = self.web_scraper.scrape_page(**self.params)

        # Output the results
        self.output_results(results)
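# Hedged usage sketch (not from the source): constructing the search command directly with
# the same keyword arguments the Splunk search pipeline would pass as key=value pairs on
# the search bar. In normal operation Splunk instantiates the class through its
# search-command protocol; the URL and selector values below are placeholders.
example_command = WebScraperSearchCommand(url="http://example.com", selector="h1",
                                          page_limit=1, timeout=30)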
def scrape_page(self, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            cherrypy.response.status = 202
            return self.render_error_json(_("No URL was provided"))

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter is different from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued parameters, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except (ValueError, TypeError):
                # The timeout is invalid. Ignore this for now, it will get picked up when
                # the user attempts to save the input
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse.urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_json(_("Proxy server information could not be obtained"))

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        cherrypy.response.status = 220
        return self.render_error_json(_(str(e)))
    except ServerNotFoundError as e:
        cherrypy.response.status = 220
        return self.render_error_json(_(str(e)))
    except (SelectorError, SelectorSyntaxError, ExpressionError):
        cherrypy.response.status = 220
        return self.render_error_json(_("Selector is invalid."))
    except LoginFormNotFound:
        cherrypy.response.status = 220
        return self.render_error_json("Login form was not found")
    except FormAuthenticationFailed:
        cherrypy.response.status = 220
        return self.render_error_json("Form authentication failed")
    except Exception as e:
        cherrypy.response.status = 500
        logger.exception("Error generated during execution")
        return self.render_error_json(_(str(e)))

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0], set_mime='application/json')
    else:
        return self.render_json(result, set_mime='application/json')
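# Hedged sketch (not from the source): the link-extraction policy applied above. On Splunk
# Cloud the handler forces https_only=True regardless of what the caller asked for;
# otherwise the caller's https_only value (if any) is honoured, and None means "not set".
def resolve_https_only(on_cloud, requested=None):
    if on_cloud:
        return True
    return bool(requested) if requested is not None else None

assert resolve_https_only(True, False) is True
assert resolve_https_only(False, True) is True
assert resolve_https_only(False) is None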
def run(self, stanza, cleaned_params, input_config):

    # Make the parameters
    interval = cleaned_params["interval"]
    title = cleaned_params["title"]
    url = cleaned_params["url"]
    selector = cleaned_params.get("selector", None)
    username = cleaned_params.get("username", None)
    password = cleaned_params.get("password", None)
    name_attributes = cleaned_params.get("name_attributes", [])
    user_agent = cleaned_params.get("user_agent", None)
    timeout = cleaned_params.get("timeout", self.timeout)
    sourcetype = cleaned_params.get("sourcetype", "web_input")
    host = cleaned_params.get("host", None)
    index = cleaned_params.get("index", "default")
    conf_stanza = cleaned_params.get("configuration", None)
    use_element_name = cleaned_params.get("use_element_name", False)
    page_limit = cleaned_params.get("page_limit", 1)
    url_filter = cleaned_params.get("url_filter", None)
    depth_limit = cleaned_params.get("depth_limit", 25)
    raw_content = cleaned_params.get("raw_content", False)
    text_separator = cleaned_params.get("text_separator", " ")
    browser = cleaned_params.get("browser", WebScraper.INTEGRATED_CLIENT)
    output_as_mv = cleaned_params.get("output_as_mv", True)
    output_results_policy = cleaned_params.get("output_results", None)
    username_field = cleaned_params.get("username_field", None)
    password_field = cleaned_params.get("password_field", None)
    authentication_url = cleaned_params.get("authentication_url", None)
    source = stanza

    if self.needs_another_run(input_config.checkpoint_dir, stanza, interval):

        # Don't scan the URL if the URL is unencrypted and the host is on Cloud
        if self.is_on_cloud(input_config.session_key) and not url.scheme == "https":
            self.logger.warn("The URL will not be processed because the host is running on Splunk Cloud and the URL isn't using encryption, url=%s", url.geturl())
            return

        # Don't scan the URL if the login URL is unencrypted and the host is on Cloud
        if self.is_on_cloud(input_config.session_key) and authentication_url is not None and authentication_url.scheme != "https":
            self.logger.warn("The URL will not be processed because the host is running on Splunk Cloud and the login URL isn't using encryption, authentication_url=%s", authentication_url.geturl())
            return

        # Get the proxy configuration
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                self.get_proxy_config(input_config.session_key, conf_stanza)
        except splunk.ResourceNotFound:
            logger.error("The proxy configuration could not be loaded (resource not found). The execution will be skipped for now for this input with stanza=%s", stanza)
            return
        except splunk.SplunkdConnectionException:
            logger.error("The proxy configuration could not be loaded (splunkd connection problem). The execution will be skipped for now for this input with stanza=%s", stanza)
            return

        # Get the secure password if necessary
        if username is not None:
            secure_password = self.get_secure_password(realm=stanza,
                                                       session_key=input_config.session_key)

            if secure_password is not None:
                password = secure_password['content']['clear_password']
                self.logger.debug("Successfully loaded the secure password for input=%s", stanza)

        # Get the information from the page
        try:
            # Make sure the page_limit is not too small
            if page_limit is None or page_limit == "" or page_limit < 1:
                logger.warn("The parameter is too small for page_limit=%r", page_limit)
                page_limit = 1

            # Make sure the depth_limit is valid
            if depth_limit is None or depth_limit == "" or depth_limit < 1:
                logger.warn("The parameter is too small for depth_limit=%r", depth_limit)
                depth_limit = 50

            # Determine how to make the match fields
            output_matches_as_mv = True
            output_matches_as_separate_fields = False

            if not output_as_mv:
                output_matches_as_mv = False
                output_matches_as_separate_fields = True

            additional_fields = {'title': title}

            # Make an instance of the web-scraper and initialize it
            web_scraper = WebScraper(timeout, logger=logger)
            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
            web_scraper.user_agent = user_agent
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

            # Get the checkpoint data so that we can determine the prior hash of the results if necessary
            checkpoint_data = self.get_checkpoint_data(input_config.checkpoint_dir, stanza)

            if checkpoint_data is None:
                checkpoint_data = {}

            # Keep a list of the matches so that we can determine if any of the results changed
            result_info = WebInputResult()

            if output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_CONTENTS_CHANGE or output_results_policy == WebInput.OUTPUT_RESULTS_WHEN_MATCHES_CHANGE:
                output_fx = None
            else:
                # Setup the output function so that we can stream the results
                output_fx = lambda result: self.output_results([result], index, source, sourcetype, host, checkpoint_data, None, result_info)

            # Perform the scrape
            results = web_scraper.scrape_page(url, selector, name_attributes,
                                              use_element_name=use_element_name,
                                              page_limit=page_limit,
                                              depth_limit=depth_limit,
                                              url_filter=url_filter,
                                              include_raw_content=raw_content,
                                              text_separator=text_separator,
                                              browser=browser,
                                              output_matches_as_mv=output_matches_as_mv,
                                              output_matches_as_separate_fields=output_matches_as_separate_fields,
                                              additional_fields=additional_fields,
                                              https_only=self.is_on_cloud(input_config.session_key),
                                              output_fx=output_fx)

            # Determine the number of results
            if output_fx is None:
                matches = len(results)
            else:
                matches = results

            logger.info("Successfully executed the website input, matches_count=%r, stanza=%s, url=%s", matches, stanza, url.geturl())

        except LoginFormNotFound:
            logger.warn('Form authentication failed since the form could not be found, stanza=%s', stanza)
        except FormAuthenticationFailed as e:
            logger.warn('Form authentication failed, stanza=%s, error="%s"', stanza, str(e))
        except WebClientException as e:
            logger.warn('Client connection failed, stanza=%s, error="%s"', stanza, str(e))
        except Exception:
            logger.exception("An exception occurred when attempting to retrieve information from the web-page, stanza=%s", stanza)

        # Get the time that the input last ran
        last_ran = self.last_ran(input_config.checkpoint_dir, stanza)

        # If we didn't output the results already (using streaming output), then do it now
        if output_fx is None:
            self.output_results(results, index, source, sourcetype, host, checkpoint_data,
                                output_results_policy, result_info)

        # Make the new checkpoint data dictionary
        new_checkpoint_data = {
            'last_run': self.get_non_deviated_last_run(last_ran, interval, stanza),
            'matches_hash': result_info.get_hash_of_all_matches(),
            'content_hash': result_info.get_hash_of_all_results()
        }

        # Save the checkpoint so that we remember when we last executed this
        self.save_checkpoint_data(input_config.checkpoint_dir, stanza, new_checkpoint_data)

        # Force garbage collection at the end of the run
        # This is useful since inputs often run infrequently and we want to clean up
        # after ourselves while we wait for the next run
        import gc
        gc.collect()
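# Hedged illustration (not from the source): the shape of the checkpoint dictionary that
# run() writes at the end of each execution, per the code above. The values are
# placeholders; the two hashes come from WebInputResult and 'last_run' is the
# deviation-corrected epoch time of the last execution.
example_checkpoint_data = {
    'last_run': 1500000000,
    'matches_hash': '<hash of all matches>',
    'content_hash': '<hash of all page results>'
}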