Beispiel #1
0
class URLParserTest(TestCase):
    """Unit tests for URLParser: parse/join round-trip and URL-type detection."""

    # URL -> expected parse quad: (scheme, ssl_flag, host, path).
    # NOTE(review): 'https://gmail.com/' is expected to yield scheme 'http'
    # with the second element True — presumably the parser normalizes https
    # to (http, ssl=True); confirm against URLParser.parse.
    TEST_URLS = {
                 'http://www.onet.pl/' : ('http', False, 'www.onet.pl', '/'),
                 'https://gmail.com/' : ('http', True, 'gmail.com', '/'),
                 'ftp://ftp.task.gda.pl/' : ('ftp', False, 'ftp.task.gda.pl', '/')
                 }

    # URL -> expected classification string from URLParser.get_type.
    TEST_URLS_TYPE = {
                      'http://onet.pl' : 'global',
                      '/test.php?a=1' : 'relative',
                      '/' : 'main',
                      '#/soma/fb/path' : 'id',
                      'test.php?a=3' : 'local'
                      }

    def setUp(self):
        # A fresh parser per test keeps the cases independent.
        self.parser = URLParser()

    def test_parse(self):
        """parse() must split each sample URL into its expected quad."""
        for url, quad in self.TEST_URLS.iteritems():
            self.assertEqual(self.parser.parse(url), quad)

    def test_join(self):
        """join() must be the inverse of parse() for every sample quad."""
        for url, quad in self.TEST_URLS.iteritems():
            self.assertEqual(self.parser.join(quad), url)

    def test_get_type(self):
        """get_type() must classify each sample URL correctly."""
        for url, url_type in self.TEST_URLS_TYPE.iteritems():
            self.assertEqual(url_type, self.parser.get_type(url))
Beispiel #2
0
 def main(self):
     
     documents = []
     queue = []
     opener = URLOpener()
     parser = URLParser()
     db = BotDB(self.conf)
     parsed = []
     
     queue += self.conf['initial']['sites']
     print queue
     
     while len(queue) > 0:
         site = queue.pop(0)
         
         if site in parsed:
             continue
 
         parsed.append(site)
         self.logger.info("Parsing site: {0}".format(site))
         self.logger.info("Len of queue: {0}".format(len(queue)))
         headers, data = opener.open(site)
         
         if 'Content-Type' in headers:
             if headers['Content-Type'].split(';')[0] == 'text/html':
                 quad = parser.parse(site)
     
                 doc = Document(quad[0], quad[1], quad[2], quad[3], headers, data)
                 documents.append(doc)
                 self._follow(doc, parser, queue, parsed, quad)
Beispiel #3
0
def main():
    conf = get_config('../conf/config.yaml')
    documents = []
    opener = URLOpener()
    parser = URLParser()
    
    sites = conf['initial']['sites']
    
    for site in sites:
        headers, data = opener.open(site)
        if headers.getheader('Content-Type').split(';')[0] == 'text/html':
            typ = parser.parse(site)
            
            doc = Document(typ[0], typ[1], typ[2], typ[3], headers, data)
            documents.append(doc)
            
            print doc.get_text()
Beispiel #4
0
 def setUp(self):
     """Create a fresh URLParser instance before each test case."""
     self.parser = URLParser()