Example #1
0
 def test_page_count(self):
     # With a budget of two pages, get_urls(WEB_SITE) is expected to use up
     # the whole budget and leave exactly two URLs queued in toProcess.
     site = WebSite(None, None)
     site.domainBase = "google.com"
     site.max_page_count = 2
     site.get_urls(WEB_SITE)
     remaining_budget = site.max_page_count
     self.assertEqual(remaining_budget, 0)
     queued = len(site.toProcess)
     self.assertEqual(queued, 2)
Example #2
0
 def test_find_plain_text_ip(self):
     # With FILTER_STRING "<.*?>" configured alongside the dotted-quad
     # search pattern, only one address is expected from WEB_DATA.
     # NOTE(review): presumably the filter strips markup so addresses inside
     # tags are ignored -- confirm against WebSite.process_data.
     #
     # The pattern is a raw string so that "\." reaches the regex engine
     # without relying on Python's handling of unknown string escapes.
     ip_pattern = r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}"
     config = Configuration(SEARCH_STRING=ip_pattern,
                            FILTER_STRING="<.*?>")
     w = WebSite(None, config)
     w.process_data(WEB_DATA, "TEXT")
     found = w.get_data()
     self.assertEqual(len(found), 1)
     # assertIn reports both the needle and the container on failure.
     self.assertIn("127.0.0.1", found)
Example #3
0
 def test_find_urls(self):
     # After get_urls(WEB_SITE) with domainBase "google.com" and a page
     # budget of 10, every URL listed below must be queued in toProcess.
     sites = ["http://www.google.com/help",
              "http://www.google.com/mail",
              "https://mail.google.com"]
     w = WebSite(None, None)
     w.domainBase = "google.com"
     w.max_page_count = 10
     w.get_urls(WEB_SITE)
     for site in sites:
         # assertIn names the missing URL and shows the queue on failure,
         # unlike assertTrue(site in ...), which only reports "False".
         self.assertIn(site, w.toProcess)
     self.assertIsNotNone(w)
Example #4
0
    def process_source(self, url, config):
        """
        Process an individual source.  This runs the fetching of a
        data source, determines which class should represent the output,
        and selects where the output goes.

        The output file name is produced by the configured format from the
        current local date and time, and is placed under the configured
        output directory.  If handling the source raises any Exception, the
        output path is touched and a warning is logged; the exception is
        not propagated to the caller.

        @param url: The string URL of the configuration.
        @param config: The configuration of the system.
        """
        source_type = config.get_source_type()
        now = datetime.datetime.today()
        file_name = config.get_format().gen_fmt_string(
            YEAR="%4d" % now.year,
            MONTH="%02d" % now.month,
            DAY="%02d" % now.day,
            HOUR="%02d" % now.hour,
            MINUTE="%02d" % now.minute,
            SECOND="%02d" % now.second)
        output_path = "%s/%s" % (config.get_output_directory(), file_name)
        try:
            if source_type == WEB_SOURCE:
                source = WebSite(url, config)
                source.output_file(output_path)
            elif source_type == RSS_SOURCE:
                source = RSSReader(url, config)
                source.output_file(output_path)
            elif source_type == RSYNC_SOURCE:
                # Only constructed here; unlike the other source types no
                # output_file() call is made for Rsync.
                source = Rsync(url, config)
        except Exception:
            # Best effort: leave a file at the output path even though the
            # source could not be processed.
            # NOTE(review): the path is interpolated into a shell command;
            # paths with spaces or shell metacharacters will misbehave.
            os.system("touch %s" % output_path)
            logger.warning("No Data found at '%s'" % url)
Example #5
0
 def test_find_ips(self):
     # With only the dotted-quad search pattern configured (no filter),
     # both addresses in WEB_DATA are expected to be reported.
     #
     # The pattern is a raw string so that "\." reaches the regex engine
     # without relying on Python's handling of unknown string escapes.
     ip_pattern = r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}"
     config = Configuration(SEARCH_STRING=ip_pattern)
     w = WebSite(None, config)
     w.process_data(WEB_DATA, "TEXT")
     found = w.get_data()
     self.assertEqual(len(found), 2)
     # assertIn reports both the needle and the container on failure.
     self.assertIn("127.0.0.1", found)
     self.assertIn("192.168.1.1", found)
Example #6
0
 def test_url_parsing_from_config(self):
     # The domain base and page budget come from the Configuration rather
     # than being assigned on the instance; after get_urls(WEB_SITE) the
     # budget should read 6 and four URLs should be known in total.
     config = Configuration(DOMAIN_BASE = "google.com",
                            MAX_PAGE_COUNT = 10)
     site = WebSite(None, config)
     site.get_urls(WEB_SITE)
     self.assertEqual(site.max_page_count, 6)
     known_urls = len(site.toProcess) + len(site.processed)
     self.assertEqual(known_urls, 4)
Example #7
0
 def test_remove_html(self):
     # With FILTER_STRING "<.*?>" configured, processing WEB_DATA as "TEXT"
     # is expected to yield exactly the NO_HTML fixture.
     site = WebSite(None, Configuration(FILTER_STRING="<.*?>"))
     site.process_data(WEB_DATA, "TEXT")
     stripped = site.get_data()
     self.assertEqual(NO_HTML, stripped)
Example #8
0
 def test_get_google(self):
     # A WebSite built from a real URL should expose non-empty data.
     site = WebSite("http://www.google.com")
     fetched = site.get_data()
     self.assertNotEqual(len(fetched), 0)
Example #9
0
 def test_empty(self):
     # A WebSite built with no URL should expose zero-length data.
     site = WebSite(None)
     data = site.get_data()
     self.assertEqual(len(data), 0)