def __init__(self):

        scheme_args = {
            'title': "Google Spreadsheet",
            'description':
            "Allows you to import/export Splunk lookups to/from Google spreadsheets",
            'use_external_validation': "true",
            'streaming_mode': "xml",
            'use_single_instance': "true"
        }

        args = [
            Field("spreadsheet",
                  "Spreadsheet Title",
                  "The title of the spreadsheet",
                  empty_allowed=False),
            Field("worksheet",
                  "Worksheet Name",
                  'The name of the worksheet (e.g. "Sheet1")',
                  empty_allowed=False),
            Field(
                "service_account_key_file",
                "OAuth2 Service Account Key File",
                'The service account key with the credentials necessary for authenticating to Google',
                empty_allowed=False,
                required_on_create=False,
                required_on_edit=False),
            BooleanField(
                "only_if_changed",
                "Import file only if changed",
                "If set to true, then the import will only be done if the Google worksheet was changed.",
                empty_allowed=True,
                required_on_create=False,
                required_on_edit=False),
            Field(
                "operation",
                "Operation",
                "The operation to perform (import into Splunk or export to Google Drive)",
                empty_allowed=False),
            Field("lookup_name",
                  "Lookup File Name",
                  'The name of the lookup file to import the content into',
                  empty_allowed=False),
            DurationField(
                "interval",
                "Interval",
                "The interval defining how often to import the file; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                empty_allowed=False),
            DeprecatedField("google_login", "Google Login",
                            'The login to use when authenticating to Google'),
            DeprecatedField(
                "google_password", "Google Password",
                'The password to use when authenticating to Google. You will need to use an app-specific password here if you are using two-factor authentication.'
            )
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name='google_spreadsheet_modular_input')
    def __init__(self):

        scheme_args = {'title': "File Meta-data",
                       'description': "Import file and directory meta-data (size, modification dates, etc.)",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}

        args = [
            FilePathField("file_path", "File or directory path",
                          "The path of the to get information on", empty_allowed=False),
            BooleanField("recurse", "Recursively iterate sub-directories",
                         "Indicates whether sub-directories ought to be recursed",
                         empty_allowed=False),
            BooleanField("only_if_changed", "Changed items only",
                         "Only include items when one of the time fields is changed",
                         empty_allowed=False),
            DurationField("interval", "Interval",
                          "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                          empty_allowed=False),
            BooleanField("include_file_hash", "Compute file hash",
                         "Compute a hash on the file (SHA224)", empty_allowed=False),
            DataSizeField("file_hash_limit", "File-size hash limit",
                          "Only include items when one of the time fields is changed",
                          empty_allowed=False),
            IntegerField("depth_limit", "Depth Limit",
                         "A limit on how many directories deep to get results for",
                         none_allowed=True, empty_allowed=True),
            WildcardField("file_filter", "File Name Filter",
                         "A wildcard for which files will be included",
                         none_allowed=True, empty_allowed=True)
            ]

        ModularInput.__init__(self, scheme_args, args, logger_name='file_meta_data_modular_input')
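
# The "include_file_hash" and "file_hash_limit" fields above imply hashing
# only files under a size cap. A hedged sketch of that behavior, using the
# SHA224 algorithm named in the field description:
import hashlib
import os

def sha224_if_small_enough(path, size_limit):
    """Return the SHA224 hex digest of a file, or None if it exceeds the limit."""
    if os.path.getsize(path) > size_limit:
        return None  # file is larger than file_hash_limit; skip hashing
    digest = hashlib.sha224()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
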
    def __init__(self, timeout=30, thread_limit=None):

        scheme_args = {'title': "Website Availability Check",
                       'description': "Connects to a website in order to obtain performance statistics",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}

        args = [
                Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False),
                URLField("url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False, require_https_on_cloud=True),
                DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False),
                Field("configuration", "Configuration", "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers", none_allowed=True, empty_allowed=True),
                Field("client_certificate", "Client Certificate Path", "Defines the path to the client certificate (if the website requires client SSL authentication)", none_allowed=True, empty_allowed=True),
                Field("client_certificate_key", "Client Certificate Key Path", "Defines the path to the client certificate key (necessary of the key is in a separate file from the certificate)", none_allowed=True, empty_allowed=True),
                Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                Field("user_agent", "User Agent", "The user-agent to use when communicating with the server", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                Field("should_contain_string", "String match", "A string that should be present in the content", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False)
        ]

        ModularInput.__init__(self, scheme_args, args, logger_name='web_availability_modular_input', logger_level=logging.DEBUG)

        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 30

        if thread_limit is None:
            self.thread_limit = WebPing.DEFAULT_THREAD_LIMIT
        else:
            self.thread_limit = thread_limit

        self.threads = {}
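
# A minimal sketch, using the `requests` library, of the measurement this
# input performs; the result keys are illustrative, not the app's schema.
import requests

def check_site(url, timeout=30, username=None, password=None):
    auth = (username, password) if username else None  # HTTP basic authentication
    response = requests.get(url, timeout=timeout, auth=auth)
    return {
        'url': url,
        'response_code': response.status_code,
        'response_time': response.elapsed.total_seconds(),
    }
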
    def __init__(self, timeout=30):

        scheme_args = {'title': "Website Availability Check",
                       'description': "Connects to a website in order to obtain performance statistics",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}
        
        args = [
                Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False),
                URLField("url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False),
                DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False),
                Field("configuration", "Configuration", "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers", none_allowed=True, empty_allowed=True),
                Field("client_certificate", "Client Certificate Path", "Defines the path to the client certificate (if the website requires client SSL authentication)", none_allowed=True, empty_allowed=True),
                Field("client_certificate_key", "Client Certificate Key Path", "Defines the path to the client certificate key (necessary of the key is in a separate file from the certificate)", none_allowed=True, empty_allowed=True),
                Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False)
                ]
        
        ModularInput.__init__( self, scheme_args, args, logger_name='web_availability_modular_input' )
        
        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 30
Example 5
    def __init__(self, timeout=30, **kwargs):

        scheme_args = {
            'title': "JWT Webhook",
            'description': "Retrieve data from jwt webhook using SSL",
            'use_single_instance': True
        }

        args = [
            IntegerField('port',
                         'Port',
                         'The port to run the web-server on',
                         none_allowed=False,
                         empty_allowed=False),
            Field(
                'secret',
                'Secret',
                'The secret key to decode the JWT encoded payload; leave it empty if the payload is not JWT encoded.',
                none_allowed=True,
                empty_allowed=True),
            Field(
                'path',
                'Path',
                'A wildcard that the path of requests must match (paths generally begin with a "/" and can include a wildcard)',
                none_allowed=True,
                empty_allowed=True),
            FilePathField(
                'key_file',
                'SSL Certificate Key File',
                'The path to the SSL certificate key file (if the certificate requires a key); typically uses .KEY file extension',
                none_allowed=True,
                empty_allowed=True,
                validate_file_existence=True),
            FilePathField(
                'cert_file',
                'SSL Certificate File',
                'The path to the SSL certificate file (if you want to use encryption); typically uses .DER, .PEM, .CRT, .CER file extensions',
                none_allowed=False,
                empty_allowed=False,
                validate_file_existence=True),
            Field(
                'password',
                'Password',
                'The password to decrypt the private key; leave it empty if the private key is not encrypted.',
                none_allowed=True,
                empty_allowed=True),
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name="webhook_modular_input",
                              sleep_interval=60)

        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 30

        self.http_daemons = {}
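
# A hedged sketch of what the "secret" field implies, using the PyJWT
# library: if a secret is configured, the request body is decoded as a JWT;
# otherwise it is passed through untouched. The HS256 algorithm is an
# assumption, not confirmed by the source.
import jwt  # PyJWT

def decode_payload(payload, secret=None):
    if not secret:
        return payload  # payload is not JWT encoded
    return jwt.decode(payload, secret, algorithms=["HS256"])
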
Example 6
    def __init__(self, timeout=30, **kwargs):

        scheme_args = {
            'title': "FTP",
            'description': "Retrieve information over FTP",
            'use_single_instance': "false"
        }

        args = [
            IntegerField("port",
                         "Port",
                         'The port to run the FTP server on',
                         none_allowed=False,
                         empty_allowed=False),
            FTPPathField(
                "path",
                "Path",
                'The path to place the received files; relative paths are based on $SPLUNK_HOME',
                none_allowed=False,
                empty_allowed=False),
            Field(
                "address",
                "Address to Listen on",
                'The address to have the FTP server listen on; leave blank to listen on all interfaces',
                none_allowed=True,
                empty_allowed=True),
            #DurationField("interval", "Interval", "The interval defining how often to make sure the server is running", empty_allowed=True, none_allowed=True)
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name="ftp_modular_input")

        self.ftp_daemons = []
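
# A minimal sketch, using pyftpdlib, of the kind of server this input wraps:
# listen on the configured address and port and place received files in
# "path". The anonymous authorizer and permission string are assumptions,
# not the app's actual configuration.
from pyftpdlib.authorizers import DummyAuthorizer
from pyftpdlib.handlers import FTPHandler
from pyftpdlib.servers import FTPServer

def start_ftp_server(address, port, path):
    authorizer = DummyAuthorizer()
    authorizer.add_anonymous(path, perm='elradfmw')  # write access so uploads are accepted
    FTPHandler.authorizer = authorizer
    server = FTPServer((address or '0.0.0.0', port), FTPHandler)
    server.serve_forever()
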
Example 7
    def __init__(self, timeout=30):

        scheme_args = {
            'title': "Internet Connection Speedtest",
            'description': "A speedtest of the Internet connection",
            'use_single_instance': False
        }

        args = [
            Field(
                "server",
                "Server",
                "The server to use for testing; will be automatically assigned if left blank",
                empty_allowed=True,
                none_allowed=True,
                required_on_create=False,
                required_on_edit=False),
            IntegerField("runs",
                         "Runs",
                         "The number of runs that should be executed",
                         empty_allowed=False,
                         none_allowed=False)
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name='speedtest_modular_input')
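
# A hedged sketch of a run using the speedtest-cli module; when "server" is
# blank the best server is auto-assigned, matching the field description.
import speedtest

def run_speedtest(server=None, runs=1):
    results = []
    for _ in range(runs):
        tester = speedtest.Speedtest()
        if server:
            tester.get_servers([int(server)])  # restrict to the configured server
        tester.get_best_server()
        tester.download()
        tester.upload()
        results.append(tester.results.dict())
    return results
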
Example 8
    def __init__(self, timeout=30):
        scheme_args = {
            'title':
            "PCAP",
            'description':
            "Watch directories for packet capture files (*.pcap) and process them using Bro."
        }

        args = [
            Field("path",
                  "Path",
                  "Specify where the pcap files are stored (eg: /var/pcap).",
                  empty_allowed=False),
            BooleanField(
                "recursive",
                "Recursive",
                "Specify if splunk should monitor all sub directories for incoming pcap. True or False.",
                empty_allowed=False),
            Field(
                "store_dir",
                "Log directory",
                "Specify where the created log files by Bro will be stored (eg: /var/log/bro).",
                empty_allowed=False),
            Field(
                "bro_bin",
                "Bro binary",
                "Specify where the Bro binary is located (eg: /opt/bro/bin/bro).",
                empty_allowed=False),
            Field("bro_opts",
                  "Bro options",
                  "Specify options to pass to Bro (None to deactivate).",
                  empty_allowed=False),
            Field("bro_script",
                  "Bro script",
                  "Specify a Bro script to use or None do deactivate.",
                  empty_allowed=False),
            Field(
                "bro_seeds",
                "Bro seed file",
                "Specify if you want to use a seed file to predict Bro UIDs or None do deactivate.",
                empty_allowed=False),
            BooleanField(
                "bro_merge",
                "Ingest content",
                "[Bro 2.1 only] Specify if the extracted content by Bro must be encoded in Base64 and appended to Bro logs. This require a Bro script to be set and this is a True or False option.",
                empty_allowed=False),
            Field(
                "content_maxsize",
                "Content maximum size",
                "[Bro 2.1 only] Objects greather than the specified size (in bytes) will not be ingested.",
                empty_allowed=False),
            Field(
                "run_maxtime",
                "Maximum execution time",
                "When a Bro instance run longer than this time (in secs), kill the instance. Set to 0 to deactivate.",
                empty_allowed=False),
        ]

        ModularInput.__init__(self, scheme_args, args)
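
# A minimal sketch of how a captured pcap might be handed to the Bro binary
# with the options above; "-r" (read a capture file) is standard Bro usage,
# everything else here is illustrative.
import subprocess

def process_pcap(bro_bin, pcap_file, store_dir, bro_opts=None, bro_script=None, run_maxtime=0):
    command = [bro_bin, '-r', pcap_file]
    if bro_opts and bro_opts != 'None':
        command.extend(bro_opts.split())
    if bro_script and bro_script != 'None':
        command.append(bro_script)
    # Kill the instance if it runs longer than run_maxtime seconds (0 disables the limit)
    subprocess.run(command, cwd=store_dir, timeout=run_maxtime or None, check=True)
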
    def __init__(self):

        scheme_args = {
            'title': "File Meta-data",
            'description':
            "Import file and directory meta-data (size, modification dates, etc.)",
            'use_external_validation': "true",
            'streaming_mode': "xml",
            'use_single_instance': "true"
        }

        args = [
            FilePathField("file_path",
                          "File or directory path",
                          "The path of the to get information on",
                          empty_allowed=False),
            BooleanField(
                "recurse",
                "Recursively iterate sub-directories",
                "Indicates whether sub-directories ought to be recursed",
                empty_allowed=False),
            BooleanField(
                "only_if_changed",
                "Changed items only",
                "Only include items when one of the time fields is changed",
                empty_allowed=False),
            DurationField(
                "interval",
                "Interval",
                "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                empty_allowed=False),
            BooleanField("include_file_hash",
                         "Compute file hash",
                         "Compute a hash on the file (SHA224)",
                         empty_allowed=False),
            DataSizeField(
                "file_hash_limit",
                "File-size hash limit",
                "Only include items when one of the time fields is changed",
                empty_allowed=False),
            IntegerField(
                "depth_limit",
                "Depth Limit",
                "A limit on how many directories deep to get results for",
                none_allowed=True,
                empty_allowed=True),
            WildcardField("file_filter",
                          "File Name Filter",
                          "A wildcard for which files will be included",
                          none_allowed=True,
                          empty_allowed=True)
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name='file_meta_data_modular_input')
Example 10
    def __init__(self, thread_limit=None):

        scheme_args = {
            'title': "Ping",
            'description': "Ping a host or a network to see if it is online",
            'use_single_instance': True
        }

        args = [
            ListField("dest",
                      "Destination",
                      "The list of hosts or networks to ping",
                      empty_allowed=True,
                      none_allowed=True,
                      required_on_create=True,
                      required_on_edit=True,
                      instance_class=DomainOrIPNetworkField),
            RangeField("port",
                       "Port",
                       "The TCP port to use (leave blank to use ICMP instead)",
                       low=1,
                       high=65535,
                       empty_allowed=True,
                       none_allowed=True,
                       required_on_create=False,
                       required_on_edit=False),
            RangeField("runs",
                       "Runs",
                       "The number of runs that should be executed",
                       low=1,
                       high=100,
                       empty_allowed=False,
                       none_allowed=False),
            DurationField(
                "interval",
                "Interval",
                "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                empty_allowed=False)
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name='ping_modular_input',
                              logger_level=logging.INFO)

        if thread_limit is None:
            self.thread_limit = PingInput.DEFAULT_THREAD_LIMIT
        else:
            self.thread_limit = thread_limit

        self.threads = {}
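
# The "port" field above switches the check from ICMP to TCP. A hedged sketch
# of the TCP branch: a plain connect works without the raw-socket privileges
# ICMP normally requires.
import socket

def tcp_ping(host, port, timeout=5):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    try:
        return sock.connect_ex((host, port)) == 0  # 0 means the port answered
    finally:
        sock.close()
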
Example 11
    def test_constructor_(self):
        """
        Make sure that the scheme args are accepted properly.
        """

        mod_input = ModularInput({
            "use_single_instance": False,
        })
        self.assertEqual(mod_input.use_single_instance, False)

        mod_input = ModularInput({
            "use_single_instance": True,
        })
        self.assertEqual(mod_input.use_single_instance, True)
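
# Note that other examples pass scheme_args values as strings ("true") while
# this test passes real booleans. A hedged sketch of the kind of coercion
# that makes both work (the library's actual helper may differ):
def normalize_boolean(value, default=False):
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', '1', 'yes', 'y'):
        return True
    if text in ('false', 'f', '0', 'no', 'n'):
        return False
    return default
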
Example 12
    def __init__(self):

        scheme_args = {'title': "Syndication Feed (RSS, ATOM, RDF)",
                       'description': "Import syndication feeds (RSS, ATOM, RDF)",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}

        args = [
                URLField("url", "Feed URL", "The URL of the feed to input", empty_allowed=False),
                BooleanField("include_only_changed", "Include only new or changed entries", "Only include entries that has not been indexed yet (won't get items that were already observed)", empty_allowed=False),
                Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                DurationField("interval", "Interval", "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False),
                BooleanField("clean_html", "Convert HTML to Text", "Convert HTML to human readable text", empty_allowed=False)
                ]

        ModularInput.__init__( self, scheme_args, args, logger_name='syndication_modular_input' )
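
# A minimal sketch, using the feedparser library, of the fetch this input
# performs; authentication and the "clean_html" conversion are omitted here.
import feedparser

def fetch_feed(url):
    feed = feedparser.parse(url)
    for entry in feed.entries:
        print(entry.get('title'), entry.get('updated'))
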
    def __init__(self, timeout=30):

        scheme_args = {'title': "Django File Cache Size",
                       'description': "Determines the size of a Django file cache based on the total size and number of files",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}
        
        args = [
                PathField("path", "Django Cache Path", "The path to the directory used by Django for the cache", empty_allowed=False),
                DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False)
                ]
        
        ModularInput.__init__( self, scheme_args, args )
        
        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 5
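
# A hedged sketch of the measurement this input performs: walk the Django
# cache directory and total the byte size and file count.
import os

def cache_stats(path):
    total_size, file_count = 0, 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            total_size += os.path.getsize(os.path.join(root, name))
            file_count += 1
    return {'total_size': total_size, 'file_count': file_count}
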
Example 14
    def handle_results(self, results, session_key, in_preview):

        # FYI: we ignore results since this is a generating command

        # Make sure that URL is using SSL if on Splunk Cloud
        if ModularInput.is_on_cloud(
                session_key) and not self.params["url"].startswith("https"):
            raise Exception(
                "The URL to scrape must use HTTPS; Splunk Cloud doesn't allow unsecured network access"
            )

        # Make sure that links get extracted if they point to HTTPS sites if on Splunk Cloud
        self.params['https_only'] = ModularInput.is_on_cloud(session_key)

        # Do the scraping
        results = self.web_scraper.scrape_page(**self.params)

        # Output the results
        self.output_results(results)
Example 15
    def __init__(self, thread_limit=None):

        scheme_args = {'title': "Port Scan",
                       'description': "Port scan a host to see what ports are open",
                       'use_single_instance': True}

        args = [
            ListField("dest", "Destination", "The list of hosts or networks to port scan", empty_allowed=True, none_allowed=True, required_on_create=True, required_on_edit=True, instance_class=DomainOrIPNetworkField),
            PortRangeField("ports", "Ports", "The TCP ports to scan", empty_allowed=False, none_allowed=False, required_on_create=True, required_on_edit=True),
            DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False)
        ]

        ModularInput.__init__(self, scheme_args, args, logger_name='portscan_modular_input', logger_level=logging.DEBUG)

        if thread_limit is None:
            self.thread_limit = PortScanInput.DEFAULT_THREAD_LIMIT
        else:
            self.thread_limit = thread_limit

        self.threads = {}
Example 16
    def __init__(self, timeout=30):

        scheme_args = {'title': "Website Availability Check",
                       'description': "Connects to a website in order to obtain performance statistics",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}
        
        args = [
                Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False),
                URLField("url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False),
                DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False),
                Field("configuration", "Configuration", "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers", none_allowed=True, empty_allowed=True),
                Field("client_certificate", "Client Certificate Path", "Defines the path to the client certificate (if the website requires client SSL authentication)", none_allowed=True, empty_allowed=True),
                Field("client_certificate_key", "Client Certificate Key Path", "Defines the path to the client certificate key (necessary of the key is in a separate file from the certificate)", none_allowed=True, empty_allowed=True)
                ]
        
        ModularInput.__init__( self, scheme_args, args, logger_name='web_availability_modular_input' )
        
        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 30
Example 17
    def __init__(self, timeout=30, **kwargs):

        scheme_args = {
            'title': "Web-pages",
            'description': "Retrieve information from web-pages",
            'use_external_validation': "true",
            'streaming_mode': "xml",
            'use_single_instance': "true"
        }

        args = [
            Field("title",
                  "Title",
                  "A short description (typically just the domain name)",
                  empty_allowed=False),
            URLField(
                "url",
                "URL",
                "The URL to connect to (must be be either HTTP or HTTPS protocol)",
                empty_allowed=False,
                require_https_on_cloud=True),
            DurationField(
                "interval",
                "Interval",
                "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                empty_allowed=False),
            IntegerField("timeout",
                         "Timeout",
                         'The timeout (in number of seconds)',
                         none_allowed=True,
                         empty_allowed=True),
            SelectorField(
                "selector",
                "Selector",
                "A selector that will match the data you want to retrieve",
                none_allowed=True,
                empty_allowed=True),

            # HTTP client options
            Field("user_agent",
                  "User Agent",
                  "The user-agent to use when communicating with the server",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("browser",
                  "Browser",
                  'The browser to use',
                  none_allowed=True,
                  empty_allowed=True),

            # Output options
            ListField("name_attributes",
                      "Field Name Attributes",
                      "A list of attributes to use for assigning a field name",
                      none_allowed=True,
                      empty_allowed=True,
                      required_on_create=False,
                      required_on_edit=False),
            BooleanField("use_element_name",
                         "Use Element Name as Field Name",
                         "Use the element's tag name as the field name",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            BooleanField("output_as_mv",
                         "Output as Multi-value Field",
                         "Output the matches as multi-value field",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            StaticListField("output_results",
                            "Indicates when results output should be created",
                            "Output the matches only when results changed",
                            none_allowed=True,
                            empty_allowed=True,
                            required_on_create=False,
                            required_on_edit=False,
                            valid_values=WebInput.OUTPUT_RESULTS_OPTIONS),
            BooleanField("raw_content",
                         "Raw content",
                         "Return the raw content returned by the server",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            BooleanField("empty_matches",
                         "Empty matches",
                         "Include empty rows (otherwise, they are excluded)",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            Field(
                "text_separator",
                "Text Separator",
                'A string that will be placed between the extracted values (e.g. a separator of ":" for a match against "<a>tree</a><a>frog</a>" would return "tree:frog")',
                none_allowed=True,
                empty_allowed=True),

            # Spidering options
            IntegerField(
                "page_limit",
                "Discovered page limit",
                "A limit on the number of pages that will be auto-discovered",
                none_allowed=True,
                empty_allowed=True,
                required_on_create=False,
                required_on_edit=False),
            IntegerField(
                "depth_limit",
                "Depth limit",
                "A limit on how many levels deep the search for pages will go",
                none_allowed=True,
                empty_allowed=True,
                required_on_create=False,
                required_on_edit=False),
            Field(
                "url_filter",
                "URL Filter",
                "A wild-card that will indicate which pages it should search for matches in",
                none_allowed=True,
                empty_allowed=True,
                required_on_create=False,
                required_on_edit=False),

            # Authentication options
            Field("username",
                  "Username",
                  "The username to use for authenticating",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("password",
                  "Password",
                  "The password to use for authenticating",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("username_field",
                  "Username field",
                  "The name of the username field on the login form",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("password_field",
                  "Password field",
                  "The name of the password field on the login form",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            URLField("authentication_url",
                     "Authentication URL",
                     "The URL of the login form",
                     none_allowed=True,
                     empty_allowed=True,
                     required_on_create=False,
                     required_on_edit=False,
                     require_https_on_cloud=True)
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name='web_input_modular_input',
                              logger_level=logging.INFO)

        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 30
    def get_scrape_page(self, request_info, **kwargs):
        """
        Perform a page scrape and return the results (useful for previewing a web_input modular
        input configuration)
        """

        result = [{}]

        # Run the input
        try:
            web_input = WebInput(timeout=10)

            kw = {}

            # Get the URL or URI
            url = None

            if 'url' in kwargs:
                url = kwargs['url']
            elif 'uri' in kwargs:
                url = kwargs['uri']

            if url is None:
                return self.render_error_json("No URL was provided", 202)

            # Get the selector
            selector = None

            if 'selector' in kwargs:
                selector = kwargs['selector']

            # Determine if we should include empty matches
            if 'empty_matches' in kwargs:
                kw['include_empty_matches'] = util.normalizeBoolean(
                    kwargs['empty_matches'], True)

            # Get the use_element_name parameter
            if 'use_element_name' in kwargs:
                kw['use_element_name'] = util.normalizeBoolean(
                    kwargs['use_element_name'], False)

            # Get the text_separator parameter
            if 'text_separator' in kwargs:
                kw['text_separator'] = kwargs['text_separator']

            # Get the output_as_mv parameter. This parameter is different from the name of the
            # argument that the class accepts and will be renamed accordingly.
            if 'output_as_mv' in kwargs:
                kw['output_matches_as_mv'] = util.normalizeBoolean(
                    kwargs['output_as_mv'], True)

                # If we are outputting as multi-valued parameters, then don't include the separate
                # fields
                if kw['output_matches_as_mv']:
                    kw['output_matches_as_separate_fields'] = False
                else:
                    # http://lukemurphey.net/issues/1643
                    kw['output_matches_as_separate_fields'] = True

            # Get the field match prefix
            if 'match_prefix' in kwargs:
                kw['match_prefix'] = kwargs['match_prefix']

            # Get the browser parameter
            if 'browser' in kwargs:
                kw['browser'] = kwargs['browser']

            # Get the page_limit parameter
            if 'page_limit' in kwargs:
                kw['page_limit'] = int(kwargs['page_limit'])

            # Get the depth_limit parameter
            if 'depth_limit' in kwargs:
                kw['depth_limit'] = int(kwargs['depth_limit'])

            # Get the url_filter parameter
            if 'url_filter' in kwargs:
                kw['url_filter'] = kwargs['url_filter']

            # Get the name_attributes parameter
            if 'name_attributes' in kwargs:
                kw['name_attributes'] = kwargs['name_attributes']

            # Get the raw_content parameter
            if 'raw_content' in kwargs:
                kw['include_raw_content'] = util.normalizeBoolean(
                    kwargs['raw_content'])

            # Only extract links using HTTPS if on Splunk Cloud
            if ModularInput.is_on_cloud(request_info.session_key):
                kw['https_only'] = True

            # Otherwise, allow callers to specify which links to extract
            elif 'https_only' in kwargs:
                kw['https_only'] = util.normalizeBoolean(kwargs['https_only'])

            # Get the proxy configuration
            conf_stanza = "default"

            # Get the timeout parameter
            timeout = 5

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except (ValueError, TypeError):
                    # The timeout is invalid. Ignore this for now, it will get picked up when
                    # the user attempts to save the input
                    pass

            # Make the web scraper instance
            web_scraper = WebScraper(timeout)

            # Get the authentication information, if available
            username = None
            password = None

            if 'password' in kwargs and 'username' in kwargs:
                username = kwargs['username']
                password = kwargs['password']

                username_field = kwargs.get('username_field', None)
                password_field = kwargs.get('password_field', None)
                authentication_url = kwargs.get('authentication_url', None)

                if authentication_url is not None:
                    authentication_url = urlparse(authentication_url)

                logger.debug("Using credentials for scrape_page")
                web_scraper.set_authentication(username, password,
                                               authentication_url,
                                               username_field, password_field)

            # Get the user-agent string
            if 'user_agent' in kwargs:
                web_scraper.user_agent = kwargs['user_agent']

            # Set the proxy authentication
            try:
                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = web_input.get_proxy_config(
                    request_info.session_key, conf_stanza)

                web_scraper.set_proxy(proxy_type, proxy_server, proxy_port,
                                      proxy_user, proxy_password)

            except ResourceNotFound:
                return self.render_error_json(
                    "Proxy server information could not be obtained", 202)

            # Scrape the page
            result = web_scraper.scrape_page(url, selector, **kw)

        except FieldValidationException as e:
            return self.render_error_json(str(e), 220)

        except ServerNotFoundError as e:
            return self.render_error_json(str(e), 220)

        except (SelectorError, SelectorSyntaxError, ExpressionError):
            return self.render_error_json("Selector is invalid. ", 220)

        except LoginFormNotFound:
            return self.render_error_json("Login form was not found", 220)

        except FormAuthenticationFailed:
            return self.render_error_json("Form authentication failed", 220)

        except Exception as e:
            logger.exception("Error generated during execution")
            return self.render_error_json(str(e), 500)

        # Return the information
        if 'include_first_result_only' in kwargs:
            return self.render_json(result[0])
        else:
            return self.render_json(result)
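
# The output_as_mv handling above maps one user-facing flag onto two
# mutually exclusive scraper flags (see http://lukemurphey.net/issues/1643).
# A compact sketch of that rule:
def output_flags(output_as_mv=True):
    return {
        'output_matches_as_mv': output_as_mv,
        'output_matches_as_separate_fields': not output_as_mv,
    }
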
    def get_load_page(self, request_info, url, **kwargs):
        """
        Proxy a web-page through so that a UI can be displayed for showing potential results.
        """

        web_client = None

        try:

            # --------------------------------------
            # 1: Make sure that user has permission to make inputs. We don't want to allow people
            #    to use this as a general proxy.
            # --------------------------------------
            if not (WebInputOperationsHandler.hasCapability('edit_modinput_web_input')
                    or WebInputOperationsHandler.hasCapability('admin_all_objects')):
                return self.render_error_html(
                    'You need the "edit_modinput_web_input" capability ' +
                    'to make website inputs', 403)

            # Don't allow proxying of the javascript files
            if url.endswith(".js"):
                return {
                    'payload': '',
                    'status': 200,
                    'headers': {
                        'Content-Type': 'application/javascript'
                    },
                }

            # --------------------------------------
            # 2: Only allow HTTPS if the install is on Splunk Cloud
            # --------------------------------------
            if ModularInput.is_on_cloud(request_info.session_key
                                        ) and not url.startswith("https://"):
                return self.render_error_html(
                    'URLs on Splunk Cloud must use HTTPS protocol',
                    401)  # TODO: determine best code

            # --------------------------------------
            # 3: Perform a request for the page
            # --------------------------------------

            # Get the proxy configuration
            conf_stanza = "default"

            try:
                web_input = WebInput(timeout=10)

                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            except ResourceNotFound:
                return self.render_error_html(
                    "Proxy server information could not be obtained", 202)

            # Get the timeout to use
            timeout = None

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except ValueError:
                    timeout = 15
            else:
                timeout = 15

            # Get the user-agent
            user_agent = kwargs.get('user_agent', None)

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Make the client
            if browser == WebScraper.FIREFOX:
                web_client = FirefoxClient(timeout, user_agent, logger)
            elif browser == WebScraper.CHROME:
                web_client = ChromeClient(timeout, user_agent, logger)
            else:
                # Fall back to the integrated client (also covers browser=None)
                web_client = DefaultWebClient(timeout, user_agent, logger)

            web_client.setProxy(proxy_type, proxy_server, proxy_port,
                                proxy_user, proxy_password)

            # Get the username and password
            username = kwargs.get('username', None)
            password = kwargs.get('password', None)

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if username is not None and password is not None:
                web_client.setCredentials(username, password)

                if authentication_url is not None:
                    logger.debug(
                        "Authenticating using form login in scrape_page")
                    web_client.doFormLogin(authentication_url, username_field,
                                           password_field)

            # Get the page
            try:
                content = web_client.get_url(url, 'GET')
                response = web_client.get_response_headers()
            except:
                logger.exception(
                    "Exception generated while attempting to content for url=%s",
                    url)
                return self.render_error_html(
                    "Page preview could not be obtained using a web-browser",
                    500)

            # --------------------------------------
            # 4: Render the content with the browser if necessary
            # --------------------------------------
            """
            if 'text/html' in response['content-type']:

                # Get the information on the browser to use
                browser = None

                if 'browser' in kwargs:
                    browser = kwargs['browser']

                # Try rendering the content using a web-browser
                try:
                    if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                        
                        web_scraper = WebScraper(timeout=timeout)
                        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                        web_scraper.set_authentication(username, password)
                        content = web_scraper.get_result_browser(urlparse(url), browser)

                except:
                    logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                    cherrypy.response.status = 500
                    return self.render_error_html("Page preview could not be obtained using a web-browser")
            """

            # --------------------------------------
            # 5: Rewrite the links in HTML files so that they also point to the internal proxy
            # --------------------------------------
            if "<html" in content:

                # Parse the content
                html = lxml.html.document_fromstring(content)

                # Rewrite the links to point to this internal proxy
                rewrite_using_internal_proxy = True

                if rewrite_using_internal_proxy:

                    def relocate_href(link):
                        """
                        Change the hrefs such that they go through the proxy.
                        """

                        link = urljoin(url, link)

                        if link.endswith(".js"):
                            return ""
                        if not link.endswith(".css"):
                            return "load_page?url=" + link
                        else:
                            return link

                    html.rewrite_links(relocate_href)

                    # Block the href links
                    for element, attribute, _, _ in html.iterlinks():
                        if element.tag == "a" and attribute == "href":
                            element.set('href', "#")

                        elif element.tag == "form" and attribute == "action":
                            element.set('action', "?")
                else:
                    html.make_links_absolute(url)

                # Determine if we should clean the JS
                clean_script = True

                if 'clean_script' in kwargs:
                    clean_script = util.normalizeBoolean(
                        kwargs['clean_script'])

                # Determine if we should clean the CSS
                clean_styles = False

                if 'clean_styles' in kwargs:
                    clean_styles = util.normalizeBoolean(
                        kwargs['clean_styles'])

                # Clean up the HTML
                if clean_styles or clean_script:

                    kill_tags = []

                    if clean_script:
                        kill_tags = ["script"]

                    # Remove the script blocks
                    cleaner = Cleaner(page_structure=False,
                                      kill_tags=kill_tags,
                                      javascript=False,
                                      links=False,
                                      style=clean_styles,
                                      safe_attrs_only=False)

                    # Get the content
                    content = lxml.html.tostring(cleaner.clean_html(html),
                                                 encoding="unicode")

                else:
                    content = lxml.html.tostring(html, encoding="unicode")

            # --------------------------------------
            # 6: Respond with the results
            # --------------------------------------
            headers = {}

            if 'content-type' in response:
                headers['Content-Type'] = response['content-type']
            else:
                headers['Content-Type'] = 'text/html'

            # --------------------------------------
            # 7: Clear Javascript files
            # --------------------------------------
            if response.get('content-type', "") == "application/javascript" \
               or response.get('content-type', "") == "application/x-javascript" \
               or response.get('content-type', "") == "text/javascript" \
               or url.endswith(".js"):

                return {'payload': '', 'headers': headers, 'status': 200}

            return {'payload': content, 'headers': headers, 'status': 200}

        except LoginFormNotFound:
            logger.debug("Login form not found")
            return self.render_error_html("Login form was not found", 200)

        except FormAuthenticationFailed as e:
            logger.debug("Form authentication failed: " + str(e))
            return self.render_error_html(
                "Form authentication failed: " + str(e), 200)

        except:
            logger.exception("Error when attempting to proxy an HTTP request")
            return self.render_error_html("Page preview could not be created",
                                          500)

        finally:
            if web_client:
                web_client.close()
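
# A self-contained sketch of the Cleaner usage above: kill <script> blocks
# (and optionally inline styles) while leaving the page structure intact.
import lxml.html
from lxml.html.clean import Cleaner

def strip_scripts(content, clean_styles=False):
    html = lxml.html.document_fromstring(content)
    cleaner = Cleaner(page_structure=False, kill_tags=["script"],
                      javascript=False, links=False, style=clean_styles,
                      safe_attrs_only=False)
    return lxml.html.tostring(cleaner.clean_html(html), encoding="unicode")

print(strip_scripts("<html><body><script>alert(1)</script><p>hi</p></body></html>"))
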