def test_page_count(self):
    """Check the page counter and queue size after get_urls with a budget of 2.

    A WebSite is built without URL or configuration, its ``domainBase`` and
    ``max_page_count`` are set by hand, and ``WEB_SITE`` is scanned. The
    counter is expected to end at 0 with exactly two entries in
    ``toProcess``.
    """
    page_budget = 2

    site = WebSite(None, None)
    site.domainBase = "google.com"
    site.max_page_count = page_budget
    site.get_urls(WEB_SITE)

    # The whole budget should have been used up ...
    self.assertEqual(site.max_page_count, 0)
    # ... and one URL queued per unit of budget.
    self.assertEqual(len(site.toProcess), page_budget)
def test_find_plain_text_ip(self):
    """Check IP extraction when a filter string is configured as well.

    The configuration carries both an IPv4-shaped ``SEARCH_STRING`` and a
    ``FILTER_STRING`` matching markup tags. After processing ``WEB_DATA``
    as "TEXT", exactly one result is expected and it must include
    "127.0.0.1".
    """
    config = Configuration(
        # Raw string: "\." in a normal literal is an invalid escape sequence
        # (SyntaxWarning on recent Python versions). The pattern is unchanged.
        SEARCH_STRING=r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}",
        FILTER_STRING="<.*?>",
    )
    w = WebSite(None, config)
    w.process_data(WEB_DATA, "TEXT")

    self.assertEqual(len(w.get_data()), 1)
    # assertIn reports both operands on failure instead of "False is not true".
    self.assertIn("127.0.0.1", w.get_data())
def test_find_urls(self):
    """Check that get_urls queues the expected URLs from ``WEB_SITE``.

    With ``domainBase`` set to "google.com" and a page budget of 10, each
    of the three listed URLs must be present in ``toProcess`` after the
    scan.
    """
    sites = [
        "http://www.google.com/help",
        "http://www.google.com/mail",
        "https://mail.google.com",
    ]
    w = WebSite(None, None)
    w.domainBase = "google.com"
    w.max_page_count = 10
    w.get_urls(WEB_SITE)

    for site in sites:
        # assertIn names the missing URL on failure; assertTrue(site in ...)
        # would only report "False is not true".
        self.assertIn(site, w.toProcess)
    self.assertIsNotNone(w)
def process_source(self, url, config):
    """
    Process an individual source.

    Builds a timestamped output path from the configured format and output
    directory, then instantiates the reader matching the configured source
    type. Web and RSS sources are asked to write to that path; an rsync
    source is only constructed. For any other source type nothing is done.
    If anything in that step raises, an empty placeholder file is created
    at the output path and a warning is logged.

    @param url: The string URL of the configuration.
    @param config: The configuration of the system.
    """
    # Local names avoid shadowing the builtins ``type`` and ``file``.
    source_type = config.get_source_type()
    now = datetime.datetime.today()
    file_name = config.get_format().gen_fmt_string(
        YEAR="%4d" % now.year,
        MONTH="%02d" % now.month,
        DAY="%02d" % now.day,
        HOUR="%02d" % now.hour,
        MINUTE="%02d" % now.minute,
        SECOND="%02d" % now.second,
    )
    output_path = "%s/%s" % (config.get_output_directory(), file_name)

    try:
        if source_type == WEB_SOURCE:
            WebSite(url, config).output_file(output_path)
        elif source_type == RSS_SOURCE:
            RSSReader(url, config).output_file(output_path)
        elif source_type == RSYNC_SOURCE:
            # No output_file call here; presumably Rsync handles its own
            # output during construction -- TODO confirm.
            Rsync(url, config)
    except Exception:
        # Best-effort boundary: a failing source must not abort the caller.
        # Create/refresh the placeholder without going through a shell, so
        # paths containing spaces or shell metacharacters are handled
        # correctly (the previous ``os.system("touch ...")`` was not).
        try:
            with open(output_path, "a"):
                pass
            os.utime(output_path, None)
        except (IOError, OSError) as err:
            logger.warning(
                "Could not create placeholder '%s': %s" % (output_path, err)
            )
        logger.warning("No Data found at '%s'" % url)
def test_find_ips(self):
    """Check that an IPv4-shaped search string yields both sample addresses.

    Only ``SEARCH_STRING`` is configured. After processing ``WEB_DATA`` as
    "TEXT", two results are expected, including "127.0.0.1" and
    "192.168.1.1".
    """
    config = Configuration(
        # Raw string: "\." in a normal literal is an invalid escape sequence
        # (SyntaxWarning on recent Python versions). The pattern is unchanged.
        SEARCH_STRING=r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}",
    )
    w = WebSite(None, config)
    w.process_data(WEB_DATA, "TEXT")

    self.assertEqual(len(w.get_data()), 2)
    # assertIn reports both operands on failure instead of "False is not true".
    self.assertIn("127.0.0.1", w.get_data())
    self.assertIn("192.168.1.1", w.get_data())
def test_url_parsing_from_config(self):
    """Check URL scanning when domain and page budget come from Configuration.

    The WebSite receives ``DOMAIN_BASE`` and ``MAX_PAGE_COUNT`` through its
    configuration instead of direct attribute assignment. After scanning
    ``WEB_SITE`` the remaining page count must be 6 and the queued plus
    processed URLs must total 4.
    """
    settings = Configuration(DOMAIN_BASE="google.com", MAX_PAGE_COUNT=10)
    site = WebSite(None, settings)
    site.get_urls(WEB_SITE)

    self.assertEqual(site.max_page_count, 6)

    # Count every URL the scan touched, whether still pending or done.
    seen_total = len(site.toProcess) + len(site.processed)
    self.assertEqual(seen_total, 4)
def test_remove_html(self):
    """Check that a tag-matching filter string leaves the expected plain text.

    ``WEB_DATA`` is processed as "TEXT" with ``FILTER_STRING`` set to a
    non-greedy tag pattern; the resulting data must equal ``NO_HTML``.
    """
    settings = Configuration(FILTER_STRING="<.*?>")
    site = WebSite(None, settings)
    site.process_data(WEB_DATA, "TEXT")

    stripped = site.get_data()
    self.assertEqual(NO_HTML, stripped)
def test_get_google(self):
    """Check that a WebSite built for google.com reports non-empty data.

    NOTE(review): this presumably performs a live network fetch -- confirm
    before running in an offline environment.
    """
    site = WebSite("http://www.google.com")
    fetched = site.get_data()
    self.assertNotEqual(len(fetched), 0)
def test_empty(self):
    """Check that a WebSite built with no URL reports empty data."""
    site = WebSite(None)
    collected = site.get_data()
    self.assertEqual(len(collected), 0)