def __init__(self): scheme_args = { 'title': "Google Spreadsheet", 'description': "Allows you to import/export Splunk lookups to/from Google spreadsheets", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true" } args = [ Field("spreadsheet", "Spreadsheet Title", "The title of the spreadsheet", empty_allowed=False), Field("worksheet", "Worksheet Name", 'The name of the worksheet (e.g. "Sheet1")', empty_allowed=False), Field( "service_account_key_file", "OAuth2 Service Account Key File", 'The service account key with the credentials necessary for authenticating to Google', empty_allowed=False, required_on_create=False, required_on_edit=False), BooleanField( "only_if_changed", "Import file only if changed", "If set to true, then the import will only be done if the Google worksheet was changed.", empty_allowed=True, required_on_create=False, required_on_edit=False), Field( "operation", "Operation", "The operation to perform (import into Splunk or export to Google Drive)", empty_allowed=False), Field("lookup_name", "Lookup File Name", 'The name of the lookup file to import the content into', empty_allowed=False), DurationField( "interval", "Interval", "The interval defining how often to import the file; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), DeprecatedField("google_login", "Google Login", 'The login to use when authenticating to Google'), DeprecatedField( "google_password", "Google Password", 'The password to use when authenticating to Google. You will need to use an app-specific password here if you are using two-factor authentication.' ) ] ModularInput.__init__(self, scheme_args, args, logger_name='google_spreadsheet_modular_input')
def __init__(self): scheme_args = {'title': "File Meta-data", 'description': "Import file and directory meta-data (size, modification dates, etc.)", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true"} args = [ FilePathField("file_path", "File or directory path", "The path of the to get information on", empty_allowed=False), BooleanField("recurse", "Recursively iterate sub-directories", "Indicates whether sub-directories ought to be recursed", empty_allowed=False), BooleanField("only_if_changed", "Changed items only", "Only include items when one of the time fields is changed", empty_allowed=False), DurationField("interval", "Interval", "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), BooleanField("include_file_hash", "Compute file hash", "Compute a hash on the file (SHA224)", empty_allowed=False), DataSizeField("file_hash_limit", "File-size hash limit", "Only include items when one of the time fields is changed", empty_allowed=False), IntegerField("depth_limit", "Depth Limit", "A limit on how many directories deep to get results for", none_allowed=True, empty_allowed=True), WildcardField("file_filter", "File Name Filter", "A wildcard for which files will be included", none_allowed=True, empty_allowed=True) ] ModularInput.__init__(self, scheme_args, args, logger_name='file_meta_data_modular_input')
def __init__(self, timeout=30, thread_limit=None):
    scheme_args = {
        'title': "Website Availability Check",
        'description': "Connects to a website in order to obtain performance statistics",
        'use_external_validation': "true",
        'streaming_mode': "xml",
        'use_single_instance': "true"
    }

    args = [
        Field("title", "Title", "A short description (typically just the domain name)",
              empty_allowed=False),
        URLField("url", "URL", "The URL to connect to (must be either HTTP or HTTPS protocol)",
                 empty_allowed=False, require_https_on_cloud=True),
        DurationField("interval", "Interval",
                      "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                      empty_allowed=False),
        Field("configuration", "Configuration",
              "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers",
              none_allowed=True, empty_allowed=True),
        Field("client_certificate", "Client Certificate Path",
              "Defines the path to the client certificate (if the website requires client SSL authentication)",
              none_allowed=True, empty_allowed=True),
        Field("client_certificate_key", "Client Certificate Key Path",
              "Defines the path to the client certificate key (necessary if the key is in a separate file from the certificate)",
              none_allowed=True, empty_allowed=True),
        Field("username", "Username",
              "The username to use for authenticating (only HTTP authentication supported)",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("password", "Password",
              "The password to use for authenticating (only HTTP authentication supported)",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("user_agent", "User Agent",
              "The user-agent to use when communicating with the server",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("should_contain_string", "String match",
              "A string that should be present in the content",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False)
    ]

    ModularInput.__init__(self, scheme_args, args,
                          logger_name='web_availability_modular_input',
                          logger_level=logging.DEBUG)

    if timeout > 0:
        self.timeout = timeout
    else:
        self.timeout = 30

    if thread_limit is None:
        self.thread_limit = WebPing.DEFAULT_THREAD_LIMIT
    else:
        self.thread_limit = thread_limit

    self.threads = {}

def __init__(self, timeout=30): scheme_args = {'title': "Website Availability Check", 'description': "Connects to a website in order to obtain performance statistics", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true"} args = [ Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False), URLField("url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False), DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), Field("configuration", "Configuration", "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers", none_allowed=True, empty_allowed=True), Field("client_certificate", "Client Certificate Path", "Defines the path to the client certificate (if the website requires client SSL authentication)", none_allowed=True, empty_allowed=True), Field("client_certificate_key", "Client Certificate Key Path", "Defines the path to the client certificate key (necessary of the key is in a separate file from the certificate)", none_allowed=True, empty_allowed=True), Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False) ] ModularInput.__init__( self, scheme_args, args, logger_name='web_availability_modular_input' ) if timeout > 0: self.timeout = timeout else: self.timeout = 30
def __init__(self, timeout=30, **kwargs): scheme_args = { 'title': "JWT Webhook", 'description': "Retrieve data from jwt webhook using SSL", 'use_single_instance': True } args = [ IntegerField('port', 'Port', 'The port to run the web-server on', none_allowed=False, empty_allowed=False), Field( 'secret', 'Secret', 'The secret key to decode the JWT encoded payload, leave it empty if the payload is not JWT encoded.', none_allowed=True, empty_allowed=True), Field( 'path', 'Path', 'A wildcard that the path of requests must match (paths generally begin with a "/" and can include a wildcard)', none_allowed=True, empty_allowed=True), FilePathField( 'key_file', 'SSL Certificate Key File', 'The path to the SSL certificate key file (if the certificate requires a key); typically uses .KEY file extension', none_allowed=True, empty_allowed=True, validate_file_existence=True), FilePathField( 'cert_file', 'SSL Certificate File', 'The path to the SSL certificate file (if you want to use encryption); typically uses .DER, .PEM, .CRT, .CER file extensions', none_allowed=False, empty_allowed=False, validate_file_existence=True), Field( 'password', 'Password', 'The password to decrypt the private key, leave it empty if the private key is not encrypted.', none_allowed=True, empty_allowed=True), ] ModularInput.__init__(self, scheme_args, args, logger_name="webhook_modular_input", sleep_interval=60) if timeout > 0: self.timeout = timeout else: self.timeout = 30 self.http_daemons = {}
def __init__(self, timeout=30, **kwargs): scheme_args = { 'title': "FTP", 'description': "Retrieve information over FTP", 'use_single_instance': "false" } args = [ IntegerField("port", "Port", 'The port to run the FTP server on', none_allowed=False, empty_allowed=False), FTPPathField( "path", "Path", 'The path to place the received files; relative paths are based on $SPLUNK_HOME', none_allowed=False, empty_allowed=False), Field( "address", "Address to Listen on", 'The address to have the FTP server listen on; leave blank to listen on all interfaces', none_allowed=True, empty_allowed=True), #DurationField("interval", "Interval", "The interval defining how often to make sure the server is running", empty_allowed=True, none_allowed=True) ] ModularInput.__init__(self, scheme_args, args, logger_name="ftp_modular_input") self.ftp_daemons = []
def __init__(self, timeout=30): scheme_args = { 'title': "Internet Connection Speedtest", 'description': "A speedtest of the Internet connection", 'use_single_instance': False } args = [ Field( "server", "Server", "The server to use for testing; will be automatically assigned if left blank", empty_allowed=True, none_allowed=True, required_on_create=False, required_on_edit=False), IntegerField("runs", "Runs", "The number of runs that should be executed", empty_allowed=False, none_allowed=False) ] ModularInput.__init__(self, scheme_args, args, logger_name='speedtest_modular_input')
def __init__(self, timeout=30): scheme_args = { 'title': "PCAP", 'description': "Watch directories for packet capture files (*.pcap) and process them using Bro." } args = [ Field("path", "Path", "Specify where the pcap files are stored (eg: /var/pcap).", empty_allowed=False), BooleanField( "recursive", "Recursive", "Specify if splunk should monitor all sub directories for incoming pcap. True or False.", empty_allowed=False), Field( "store_dir", "Log directory", "Specify where the created log files by Bro will be stored (eg: /var/log/bro).", empty_allowed=False), Field( "bro_bin", "Bro binary", "Specify where the Bro binary is located (eg: /opt/bro/bin/bro).", empty_allowed=False), Field("bro_opts", "Bro options", "Specify options to pass to Bro (None to deactivate).", empty_allowed=False), Field("bro_script", "Bro script", "Specify a Bro script to use or None do deactivate.", empty_allowed=False), Field( "bro_seeds", "Bro seed file", "Specify if you want to use a seed file to predict Bro UIDs or None do deactivate.", empty_allowed=False), BooleanField( "bro_merge", "Ingest content", "[Bro 2.1 only] Specify if the extracted content by Bro must be encoded in Base64 and appended to Bro logs. This require a Bro script to be set and this is a True or False option.", empty_allowed=False), Field( "content_maxsize", "Content maximum size", "[Bro 2.1 only] Objects greather than the specified size (in bytes) will not be ingested.", empty_allowed=False), Field( "run_maxtime", "Maximum execution time", "When a Bro instance run longer than this time (in secs), kill the instance. Set to 0 to deactivate.", empty_allowed=False), ] ModularInput.__init__(self, scheme_args, args)
def __init__(self): scheme_args = { 'title': "File Meta-data", 'description': "Import file and directory meta-data (size, modification dates, etc.)", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true" } args = [ FilePathField("file_path", "File or directory path", "The path of the to get information on", empty_allowed=False), BooleanField( "recurse", "Recursively iterate sub-directories", "Indicates whether sub-directories ought to be recursed", empty_allowed=False), BooleanField( "only_if_changed", "Changed items only", "Only include items when one of the time fields is changed", empty_allowed=False), DurationField( "interval", "Interval", "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), BooleanField("include_file_hash", "Compute file hash", "Compute a hash on the file (SHA224)", empty_allowed=False), DataSizeField( "file_hash_limit", "File-size hash limit", "Only include items when one of the time fields is changed", empty_allowed=False), IntegerField( "depth_limit", "Depth Limit", "A limit on how many directories deep to get results for", none_allowed=True, empty_allowed=True), WildcardField("file_filter", "File Name Filter", "A wildcard for which files will be included", none_allowed=True, empty_allowed=True) ] ModularInput.__init__(self, scheme_args, args, logger_name='file_meta_data_modular_input')
def __init__(self, thread_limit=None): scheme_args = { 'title': "Ping", 'description': "Ping a host or a network to see if it is online", 'use_single_instance': True } args = [ ListField("dest", "Destination", "The list of hosts or networks to ping", empty_allowed=True, none_allowed=True, required_on_create=True, required_on_edit=True, instance_class=DomainOrIPNetworkField), RangeField("port", "Port", "The TCP port to use (leave blank to use ICMP instead)", low=1, high=65535, empty_allowed=True, none_allowed=True, required_on_create=False, required_on_edit=False), RangeField("runs", "Runs", "The number of runs that should be executed", low=1, high=100, empty_allowed=False, none_allowed=False), DurationField( "interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False) ] ModularInput.__init__(self, scheme_args, args, logger_name='ping_modular_input', logger_level=logging.INFO) if thread_limit is None: self.thread_limit = PingInput.DEFAULT_THREAD_LIMIT else: self.thread_limit = thread_limit self.threads = {}
def test_constructor_(self):
    """
    Make sure that the scheme args are accepted properly.
    """

    mod_input = ModularInput({
        "use_single_instance": False,
    })

    self.assertEqual(mod_input.use_single_instance, False)

    mod_input = ModularInput({
        "use_single_instance": True,
    })

    self.assertEqual(mod_input.use_single_instance, True)

def __init__(self): scheme_args = {'title': "Syndication Feed (RSS, ATOM, RDF)", 'description': "Import syndication feeds (RSS, ATOM, RDF)", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true"} args = [ URLField("url", "Feed URL", "The URL of the feed to input", empty_allowed=False), BooleanField("include_only_changed", "Include only new or changed entries", "Only include entries that has not been indexed yet (won't get items that were already observed)", empty_allowed=False), Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), DurationField("interval", "Interval", "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), BooleanField("clean_html", "Convert HTML to Text", "Convert HTML to human readable text", empty_allowed=False) ] ModularInput.__init__( self, scheme_args, args, logger_name='syndication_modular_input' )
def __init__(self, timeout=30): scheme_args = {'title': "Django File Cache Size", 'description': "Determines the size of a Django file cache based on the total size and number of files", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true"} args = [ PathField("path", "Django Cache Path", "The path to the directory used by Django for the cache", empty_allowed=False), DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False) ] ModularInput.__init__( self, scheme_args, args ) if timeout > 0: self.timeout = timeout else: self.timeout = 5
def handle_results(self, results, session_key, in_preview):
    # FYI: we ignore results since this is a generating command

    # Make sure that the URL is using SSL if on Splunk Cloud
    if ModularInput.is_on_cloud(session_key) and not self.params["url"].startswith("https"):
        raise Exception("The URL to scrape must use HTTPS; Splunk Cloud doesn't allow unsecured network access")

    # Only extract links that point to HTTPS sites if on Splunk Cloud
    self.params['https_only'] = ModularInput.is_on_cloud(session_key)

    # Do the scraping
    results = self.web_scraper.scrape_page(**self.params)

    # Output the results
    self.output_results(results)

def __init__(self, thread_limit=None): scheme_args = {'title': "Port Scan", 'description': "Port scan a host to see what ports are open", 'use_single_instance': True} args = [ ListField("dest", "Destination", "The list of hosts or networks to port scan", empty_allowed=True, none_allowed=True, required_on_create=True, required_on_edit=True, instance_class=DomainOrIPNetworkField), PortRangeField("ports", "Ports", "The TCP ports to scan", empty_allowed=False, none_allowed=False, required_on_create=True, required_on_edit=True), DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False) ] ModularInput.__init__(self, scheme_args, args, logger_name='portscan_modular_input', logger_level=logging.DEBUG) if thread_limit is None: self.thread_limit = PortScanInput.DEFAULT_THREAD_LIMIT else: self.thread_limit = thread_limit self.threads = {}
def __init__(self, timeout=30): scheme_args = {'title': "Website Availability Check", 'description': "Connects to a website in order to obtain performance statistics", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true"} args = [ Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False), URLField("url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False), DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), Field("configuration", "Configuration", "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers", none_allowed=True, empty_allowed=True), Field("client_certificate", "Client Certificate Path", "Defines the path to the client certificate (if the website requires client SSL authentication)", none_allowed=True, empty_allowed=True), Field("client_certificate_key", "Client Certificate Key Path", "Defines the path to the client certificate key (necessary of the key is in a separate file from the certificate)", none_allowed=True, empty_allowed=True) ] ModularInput.__init__( self, scheme_args, args, logger_name='web_availability_modular_input' ) if timeout > 0: self.timeout = timeout else: self.timeout = 30
def __init__(self, timeout=30, **kwargs): scheme_args = { 'title': "Web-pages", 'description': "Retrieve information from web-pages", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true" } args = [ Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False), URLField( "url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False, require_https_on_cloud=True), DurationField( "interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), IntegerField("timeout", "Timeout", 'The timeout (in number of seconds)', none_allowed=True, empty_allowed=True), SelectorField( "selector", "Selector", "A selector that will match the data you want to retrieve", none_allowed=True, empty_allowed=True), # HTTP client options Field("user_agent", "User Agent", "The user-agent to use when communicating with the server", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("browser", "Browser", 'The browser to use', none_allowed=True, empty_allowed=True), # Output options ListField("name_attributes", "Field Name Attributes", "A list of attributes to use for assigning a field name", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), BooleanField("use_element_name", "Use Element Name as Field Name", "Use the element's tag name as the field name", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), BooleanField("output_as_mv", "Output as Multi-value Field", "Output the matches as multi-value field", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), StaticListField("output_results", "Indicates when results output should be created", "Output the matches only when results changed", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False, valid_values=WebInput.OUTPUT_RESULTS_OPTIONS), BooleanField("raw_content", "Raw content", "Return the raw content returned by the server", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), BooleanField("empty_matches", "Empty matches", "Include empty rows (otherwise, they are excluded)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field( "text_separator", "Text Separator", 'A string that will be placed between the extracted values (e.g. 
a separator of ":" for a match against "<a>tree</a><a>frog</a>" would return "tree:frog")', none_allowed=True, empty_allowed=True), # Spidering options IntegerField( "page_limit", "Discovered page limit", "A limit on the number of pages that will be auto-discovered", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), IntegerField( "depth_limit", "Depth limit", "A limit on how many levels deep the search for pages will go", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field( "url_filter", "URL Filter", "A wild-card that will indicate which pages it should search for matches in", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), # Authentication options Field("username", "Username", "The username to use for authenticating", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("password", "Password", "The password to use for authenticating", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("username_field", "Username field", "The name of the username field on the login form", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("password_field", "Password field", "The name of the password field on the login form", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), URLField("authentication_url", "Authentication URL", "The URL of the login form", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False, require_https_on_cloud=True) ] ModularInput.__init__(self, scheme_args, args, logger_name='web_input_modular_input', logger_level=logging.INFO) if timeout > 0: self.timeout = timeout else: self.timeout = 30
def get_scrape_page(self, request_info, **kwargs):
    """
    Perform a page scrape and return the results (useful for previewing a web_input
    modular input configuration)
    """

    result = [{}]

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        kw = {}

        # Get the URL or URI
        url = None

        if 'url' in kwargs:
            url = kwargs['url']
        elif 'uri' in kwargs:
            url = kwargs['uri']

        if url is None:
            return self.render_error_json("No URL was provided", 202)

        # Get the selector
        selector = None

        if 'selector' in kwargs:
            selector = kwargs['selector']

        # Determine if we should include empty matches
        if 'empty_matches' in kwargs:
            kw['include_empty_matches'] = util.normalizeBoolean(kwargs['empty_matches'], True)

        # Get the use_element_name parameter
        if 'use_element_name' in kwargs:
            kw['use_element_name'] = util.normalizeBoolean(kwargs['use_element_name'], False)

        # Get the text_separator parameter
        if 'text_separator' in kwargs:
            kw['text_separator'] = kwargs['text_separator']

        # Get the output_as_mv parameter. This parameter is different from the name of the
        # argument that the class accepts and will be renamed accordingly.
        if 'output_as_mv' in kwargs:
            kw['output_matches_as_mv'] = util.normalizeBoolean(kwargs['output_as_mv'], True)

            # If we are outputting as multi-valued parameters, then don't include the separate
            # fields
            if kw['output_matches_as_mv']:
                kw['output_matches_as_separate_fields'] = False
            else:
                # http://lukemurphey.net/issues/1643
                kw['output_matches_as_separate_fields'] = True

        # Get the field match prefix
        if 'match_prefix' in kwargs:
            kw['match_prefix'] = kwargs['match_prefix']

        # Get the browser parameter
        if 'browser' in kwargs:
            kw['browser'] = kwargs['browser']

        # Get the page_limit parameter
        if 'page_limit' in kwargs:
            kw['page_limit'] = int(kwargs['page_limit'])

        # Get the depth_limit parameter
        if 'depth_limit' in kwargs:
            kw['depth_limit'] = int(kwargs['depth_limit'])

        # Get the url_filter parameter
        if 'url_filter' in kwargs:
            kw['url_filter'] = kwargs['url_filter']

        # Get the name_attributes parameter
        if 'name_attributes' in kwargs:
            kw['name_attributes'] = kwargs['name_attributes']

        # Get the raw_content parameter
        if 'raw_content' in kwargs:
            kw['include_raw_content'] = util.normalizeBoolean(kwargs['raw_content'])

        # Only extract links using HTTPS if on Splunk Cloud
        if ModularInput.is_on_cloud(request_info.session_key):
            kw['https_only'] = True

        # Otherwise, allow callers to specify which links to extract
        elif 'https_only' in kwargs:
            kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

        # Get the proxy configuration
        conf_stanza = "default"

        # Get the timeout parameter
        timeout = 5

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                # The timeout is invalid. Ignore this for now; it will get picked up when
                # the user attempts to save the input
                pass

        # Make the web scraper instance
        web_scraper = WebScraper(timeout)

        # Get the authentication information, if available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if authentication_url is not None:
                authentication_url = urlparse(authentication_url)

            logger.debug("Using credentials for scrape_page")
            web_scraper.set_authentication(username, password, authentication_url,
                                           username_field, password_field)

        # Get the user-agent string
        if 'user_agent' in kwargs:
            web_scraper.user_agent = kwargs['user_agent']

        # Set the proxy authentication
        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

        except ResourceNotFound:
            return self.render_error_json("Proxy server information could not be obtained", 202)

        # Scrape the page
        result = web_scraper.scrape_page(url, selector, **kw)

    except FieldValidationException as e:
        return self.render_error_json(str(e), 220)

    except ServerNotFoundError as e:
        return self.render_error_json(str(e), 220)

    except (SelectorError, SelectorSyntaxError, ExpressionError):
        return self.render_error_json("Selector is invalid.", 220)

    except LoginFormNotFound:
        return self.render_error_json("Login form was not found", 220)

    except FormAuthenticationFailed:
        return self.render_error_json("Form authentication failed", 220)

    except Exception as e:
        logger.exception("Error generated during execution")
        return self.render_error_json(str(e), 500)

    # Return the information
    if 'include_first_result_only' in kwargs:
        return self.render_json(result[0])
    else:
        return self.render_json(result)

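# Hypothetical caller sketch (not from the original source): illustrates the keyword arguments
# that get_scrape_page() above reads from **kwargs. The handler instance and request_info object
# are assumed to be supplied by the surrounding REST handler framework.
#
#   handler.get_scrape_page(request_info,
#                           url="https://example.com",
#                           selector="h1",
#                           output_as_mv="true",
#                           empty_matches="false",
#                           timeout="10")
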
def get_load_page(self, request_info, url, **kwargs):
    """
    Proxy a web-page through so that a UI can be displayed for showing potential results.
    """

    web_client = None

    try:
        # --------------------------------------
        # 1: Make sure that the user has permission to make inputs. We don't want to allow
        #    people to use this as a general proxy.
        # --------------------------------------
        if not WebInputOperationsHandler.hasCapability('edit_modinput_web_input') \
                and not WebInputOperationsHandler.hasCapability('admin_all_objects'):
            return self.render_error_html('You need the "edit_modinput_web_input" capability ' +
                                          'to make website inputs', 403)

        # Don't allow proxying of the javascript files
        if url.endswith(".js"):
            return {
                'payload': '',
                'status': 200,
                'headers': {
                    'Content-Type': 'application/javascript'
                },
            }

        # --------------------------------------
        # 2: Only allow HTTPS if the install is on Splunk Cloud
        # --------------------------------------
        if ModularInput.is_on_cloud(request_info.session_key) and not url.startswith("https://"):
            # TODO: determine the best status code
            return self.render_error_html('URLs on Splunk Cloud must use HTTPS protocol', 401)

        # --------------------------------------
        # 3: Perform a request for the page
        # --------------------------------------
        # Get the proxy configuration
        conf_stanza = "default"

        try:
            web_input = WebInput(timeout=10)

            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

        except ResourceNotFound:
            return self.render_error_html("Proxy server information could not be obtained", 202)

        # Get the timeout to use
        timeout = None

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                timeout = 15
        else:
            timeout = 15

        # Get the user-agent
        user_agent = kwargs.get('user_agent', None)

        # Get the information on the browser to use
        browser = None

        if 'browser' in kwargs:
            browser = kwargs['browser']

        # Make the client
        if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
            web_client = DefaultWebClient(timeout, user_agent, logger)
        elif browser == WebScraper.FIREFOX:
            web_client = FirefoxClient(timeout, user_agent, logger)
        elif browser == WebScraper.CHROME:
            web_client = ChromeClient(timeout, user_agent, logger)

        web_client.setProxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

        # Get the username and password
        username = kwargs.get('username', None)
        password = kwargs.get('password', None)

        username_field = kwargs.get('username_field', None)
        password_field = kwargs.get('password_field', None)
        authentication_url = kwargs.get('authentication_url', None)

        if username is not None and password is not None:
            web_client.setCredentials(username, password)

            if authentication_url is not None:
                logger.debug("Authenticating using form login in scrape_page")
                web_client.doFormLogin(authentication_url, username_field, password_field)

        # Get the page
        try:
            content = web_client.get_url(url, 'GET')
            response = web_client.get_response_headers()
        except:
            logger.exception("Exception generated while attempting to get content for url=%s", url)
            return self.render_error_html("Page preview could not be obtained using a web-browser", 500)

        # --------------------------------------
        # 4: Render the content with the browser if necessary
        # --------------------------------------
        """
        if 'text/html' in response['content-type']:

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Try rendering the content using a web-browser
            try:
                if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                    web_scraper = WebScraper(timeout=timeout)
                    web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                    web_scraper.set_authentication(username, password)
                    content = web_scraper.get_result_browser(urlparse(url), browser)

            except:
                logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html("Page preview could not be obtained using a web-browser")
        """

        # --------------------------------------
        # 5: Rewrite the links in HTML files so that they also point to the internal proxy
        # --------------------------------------
        if "<html" in content:

            # Parse the content
            html = lxml.html.document_fromstring(content)

            # Rewrite the links to point to this internal proxy
            rewrite_using_internal_proxy = True

            if rewrite_using_internal_proxy:

                def relocate_href(link):
                    """
                    Change the hrefs such that they go through the proxy.
                    """

                    link = urljoin(url, link)

                    if link.endswith(".js"):
                        return ""
                    if not link.endswith(".css"):
                        return "load_page?url=" + link
                    else:
                        return link

                html.rewrite_links(relocate_href)

                # Block the href links
                for element, attribute, _, _ in html.iterlinks():
                    if element.tag == "a" and attribute == "href":
                        element.set('href', "#")

                    elif element.tag == "form" and attribute == "action":
                        element.set('action', "?")
            else:
                html.make_links_absolute(url)

            # Determine if we should clean the JS
            clean_script = True

            if 'clean_script' in kwargs:
                clean_script = util.normalizeBoolean(kwargs['clean_script'])

            # Determine if we should clean the CSS
            clean_styles = False

            if 'clean_styles' in kwargs:
                clean_styles = util.normalizeBoolean(kwargs['clean_styles'])

            # Clean up the HTML
            if clean_styles or clean_script:

                kill_tags = []

                if clean_script:
                    kill_tags = ["script"]

                # Remove the script blocks
                cleaner = Cleaner(page_structure=False, kill_tags=kill_tags, javascript=False,
                                  links=False, style=clean_styles, safe_attrs_only=False)

                # Get the content
                content = lxml.html.tostring(cleaner.clean_html(html), encoding="unicode")

            else:
                content = lxml.html.tostring(html, encoding="unicode")

        # --------------------------------------
        # 6: Respond with the results
        # --------------------------------------
        headers = {}

        if 'content-type' in response:
            headers['Content-Type'] = response['content-type']
        else:
            headers['Content-Type'] = 'text/html'

        # --------------------------------------
        # 7: Clear Javascript files
        # --------------------------------------
        if response.get('content-type', "") == "application/javascript" \
                or response.get('content-type', "") == "application/x-javascript" \
                or response.get('content-type', "") == "text/javascript" \
                or url.endswith(".js"):
            return {'payload': '', 'headers': headers, 'status': 200}

        return {'payload': content, 'headers': headers, 'status': 200}

    except LoginFormNotFound:
        logger.debug("Login form not found")
        return self.render_error_html("Login form was not found", 200)

    except FormAuthenticationFailed as e:
        logger.debug("Form authentication failed: " + str(e))
        return self.render_error_html("Form authentication failed: " + str(e), 200)

    except:
        logger.exception("Error when attempting to proxy an HTTP request")
        return self.render_error_html("Page preview could not be created", 500)

    finally:
        if web_client:
            web_client.close()

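# Hypothetical caller sketch (not from the original source): illustrates the optional keyword
# arguments that get_load_page() above inspects. The handler instance and request_info object are
# assumed to be supplied by the surrounding REST handler framework.
#
#   handler.get_load_page(request_info,
#                         "https://example.com",
#                         browser=WebScraper.INTEGRATED_CLIENT,
#                         timeout="15",
#                         clean_script="true",
#                         clean_styles="false")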