def test_unparsable(self):
    """Scraping a URL that ends in .png should produce an empty match list."""
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/media/images/link_external.png"),
        selector_field.to_python(".hero-unit .main_background"),
        timeout=3,
        output_matches_as_mv=True,
    )

    self.assertEqual(result['match'], [])
def test_scrape_page_name_attributes_separate_fields(self):
    """
    Scrape with name_attributes=["class"] and separate-field output and
    check the value stored under the 'match_hd_1' key.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://127.0.0.1:8888"),
        selector_field.to_python(".hd"),
        username="******",
        password="******",
        timeout=3,
        name_attributes=["class"],
        output_matches_as_separate_fields=True,
        output_matches_as_mv=False,
    )

    self.assertEqual(result['match_hd_1'], 'Mode:')
def test_scrape_page_name_attributes(self):
    """
    Scrape with name_attributes=["class"] and check that 31 values are
    collected under the 'hd' key.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://127.0.0.1:8888"),
        selector_field.to_python(".hd"),
        username="******",
        password="******",
        timeout=3,
        name_attributes=["class"],
    )

    self.assertEqual(len(result['hd']), 31)
def test_scrape_encoding_detect_meta(self):
    """
    Scrape with only the meta-based charset detection enabled (content-type
    header and sniffing detection are switched off) and expect utf-8.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/work/new-testament/Mark/1/2"),
        selector_field.to_python(".verse-container"),
        charset_detect_meta_enabled=True,
        charset_detect_content_type_header_enabled=False,
        charset_detect_sniff_enabled=False,
    )

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(result['encoding'], "utf-8")
def test_scrape_unavailable_page(self):
    """
    Scrape an address that is expected not to respond within the three
    second timeout and check that the result is flagged as timed out.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://192.168.30.23/"),
        selector_field.to_python(".hero-unit.main_background"),
        timeout=3,
    )

    self.assertEqual(result['timed_out'], True)
def test_scrape_page(self):
    """Scrape a page with default options and expect a 200 with one match."""
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/"),
        selector_field.to_python(".hero-unit.main_background"),
    )

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 1)
def test_scrape_page_name_attributes_escaped_name(self):
    """
    Scrape "input" elements named by their onclick attribute (empty matches
    included) and check that the expected field names appear in the result.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://127.0.0.1:8888"),
        selector_field.to_python("input"),
        username="******",
        password="******",
        timeout=3,
        name_attributes=["onclick"],
        include_empty_matches=True,
    )

    # assertIn reports the container contents on failure, unlike assertTrue.
    self.assertIn('btnBerTest__', result)
    self.assertIn('btnReset__', result)
def test_scrape_page_adjacent_selector(self):
    """Scrape with a selector that uses the adjacent-sibling combinator ("+")."""
    # For bug: http://lukemurphey.net/issues/773
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/"),
        selector_field.to_python("h1+p,.sharing-buttons"),
        timeout=3,
        output_matches_as_mv=True,
    )

    self.assertEqual(len(result['match']), 2)
def test_scrape_page_with_invalid_credentials(self):
    """
    Scrape the local test server without supplying a username or password
    and expect no matches.

    NOTE(review): despite the name, no credentials are passed at all;
    confirm that this is the intended scenario.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://127.0.0.1:8888"),
        selector_field.to_python("tr"),
        timeout=3,
        output_matches_as_mv=True,
    )

    self.assertEqual(len(result['match']), 0)
def test_scrape_page_bad_encoding(self):
    """
    Scrape a feed and check that matches are returned and that the reported
    encoding is ISO-8859-1.
    """
    # http://lukemurphey.net/issues/987
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://rss.slashdot.org/Slashdot/slashdot"),
        selector_field.to_python("description"),
    )

    self.assertEqual(result['response_code'], 200)
    self.assertGreater(len(result['match']), 0)
    self.assertEqual(result['encoding'], "ISO-8859-1")
def test_scrape_encoding_detect_page(self):
    """
    Scrape a page containing Greek text with default charset detection and
    check the match count, one decoded match and the reported encoding.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/work/new-testament/Mark/1/2?async"),
        selector_field.to_python(".verse-container"),
    )

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 45)

    # Normalize both sides to NFC so that differing compositions of the
    # accented Greek characters do not cause a spurious mismatch.
    self.assertEqual(
        unicodedata.normalize('NFC', result['match'][1]),
        unicodedata.normalize('NFC', u"2 Καθὼς γέγραπται ἐν τῷ Ἠσαίᾳ τῷ προφήτῃ Ἰδοὺ ἀποστέλλω τὸν ἄγγελόν μου πρὸ προσώπου σου , ὃς κατασκευάσει τὴν ὁδόν σου :"),
    )
    self.assertEqual(result['encoding'], "utf-8")
def test_scrape_page_child_text(self):
    """
    Ensure that text from nodes under the selected nodes is properly
    extracted.
    """
    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    # scrape_page is invoked on the class, so no WebInput instance is needed.
    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/"),
        selector_field.to_python(".hero-unit.main_background"),
        output_matches_as_mv=True,
    )

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 1)
    self.assertEqual(
        result['match'][0],
        "Ancient Greek, Modern Design TextCritical.net is a website that provides a library of ancient Greek works",
    )
def handle_results(self, results, in_preview, session_key):
    """
    Scrape the configured page and emit the outcome as a single result.

    Arguments:
    results -- ignored, since this is a generating command
    in_preview -- not used by this method
    session_key -- not used by this method
    """
    # Positional arguments, in the order the original call supplied them.
    scrape_args = (
        self.url,
        self.selector,
        self.username,
        self.password,
        self.timeout,
        self.name_attributes,
        self.output_matches_as_mv,
        self.output_matches_as_separate_fields,
    )

    # Empty matches are excluded and no proxy server is configured here.
    scrape_options = {
        "include_empty_matches": False,
        "proxy_type": "http",
        "proxy_server": None,
        "proxy_port": None,
        "proxy_user": None,
        "proxy_password": None,
    }

    scraped = WebInput.scrape_page(*scrape_args, **scrape_options)
    self.logger.debug("Retrieved results, result=%r", scraped)

    # Hand the single scrape result over as a one-element list.
    self.output_results([scraped])
def test_scrape_page_mv(self):
    """
    Scrape "h2" elements as a multi-value field and check that the written
    event contains one "match=" entry per match.
    """
    web_input = WebInput(timeout=3)

    url_field = URLField("test_web_input", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(
        url_field.to_python("http://textcritical.net/"),
        selector_field.to_python("h2"),
        output_matches_as_mv=True,
    )

    self.assertEqual(result['response_code'], 200)
    self.assertEqual(len(result['match']), 3)

    # Write the event into an in-memory buffer so the output can be inspected.
    out = StringIO()
    web_input.output_event(
        result,
        stanza="web_input://textcritical_net",
        index="main",
        source="test_web_input",
        sourcetype="sourcetype",
        out=out,
    )

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(re.findall("match=", out.getvalue())), 3)
def scrape_page(self, url, selector, **kwargs):
    """
    Perform a page scrape (useful for previewing a web_input modular input
    configuration).

    Arguments:
    url -- the URL handed to WebInput.scrape_page
    selector -- the selector handed to WebInput.scrape_page
    kwargs -- optional settings:
        username, password -- used only when both are supplied
        user_agent -- the user-agent string to send
        include_empty_matches -- normalized to a boolean, defaults to False

    Returns the rendered JSON error (with the response status set to 202)
    when the proxy configuration cannot be found or a field fails
    validation.
    """

    result = {}

    # Run the input
    try:
        web_input = WebInput(timeout=10)

        # Get the authentication information; it is used only when both
        # parts are available
        username = None
        password = None

        if 'password' in kwargs and 'username' in kwargs:
            username = kwargs['username']
            password = kwargs['password']

        # Get the user-agent string (None when not supplied)
        user_agent = kwargs.get('user_agent')

        # Determine if we should include empty matches
        include_empty_matches = False

        if 'include_empty_matches' in kwargs:
            include_empty_matches = util.normalizeBoolean(kwargs['include_empty_matches'], True)

        # Get the proxy configuration
        conf_stanza = "default"

        try:
            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)
        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_json(_("Proxy server information could not be obtained"))

        # Scrape the page
        result = WebInput.scrape_page(
            url,
            selector,
            username=username,
            password=password,
            include_empty_matches=include_empty_matches,
            proxy_type=proxy_type,
            proxy_server=proxy_server,
            proxy_port=proxy_port,
            proxy_user=proxy_user,
            proxy_password=proxy_password,
            user_agent=user_agent,
        )

    # "except X as e" is valid on Python 2.6+ and Python 3, whereas the
    # older "except X, e" form is a syntax error on Python 3.
    except FieldValidationException as e:
        cherrypy.response.status = 202
        return self.render_error_json(_(str(e)))

    # NOTE(review): as visible here, the success path ends without returning
    # or rendering `result` -- confirm whether a return statement is missing.
def test_input_timeout(self):
    """
    Scrape an HTTPS address that is expected not to respond within the three
    second timeout and check that the result is flagged as timed out.
    """
    url_field = URLField("test_input_timeout", "title", "this is a test")
    selector_field = SelectorField("test_web_input_css", "title", "this is a test")

    result = WebInput.scrape_page(
        url_field.to_python("https://192.168.30.23/"),
        selector_field.to_python("div"),
        timeout=3,
    )

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(result['timed_out'], True)