class URLParserTest(TestCase):
    """Unit tests for URLParser: parse/join round-trips and URL-type
    classification."""

    # url -> (scheme, is_secure, host, path) quad expected from parse().
    # NOTE(review): the https entry maps to scheme 'http' with the secure
    # flag True -- presumably the parser normalizes https to (http, True);
    # confirm against URLParser.parse.
    TEST_URLS = {
        'http://www.onet.pl/': ('http', False, 'www.onet.pl', '/'),
        'https://gmail.com/': ('http', True, 'gmail.com', '/'),
        'ftp://ftp.task.gda.pl/': ('ftp', False, 'ftp.task.gda.pl', '/'),
    }

    # url -> classification label expected from get_type()
    TEST_URLS_TYPE = {
        'http://onet.pl': 'global',
        '/test.php?a=1': 'relative',
        '/': 'main',
        '#/soma/fb/path': 'id',
        'test.php?a=3': 'local',
    }

    def setUp(self):
        """Create a fresh URLParser for each test."""
        self.parser = URLParser()

    def test_parse(self):
        """parse() must decompose each URL into its expected quad."""
        # items() instead of Py2-only iteritems(); assertEqual instead of
        # the deprecated assertEquals alias.
        for url, quad in self.TEST_URLS.items():
            self.assertEqual(self.parser.parse(url), quad)

    def test_join(self):
        """join() must reassemble each quad back into the original URL."""
        for url, quad in self.TEST_URLS.items():
            self.assertEqual(self.parser.join(quad), url)

    def test_get_type(self):
        """get_type() must classify each URL with the expected label."""
        # 'expected' avoids shadowing the builtin `type`
        for url, expected in self.TEST_URLS_TYPE.items():
            self.assertEqual(expected, self.parser.get_type(url))
def main(self): documents = [] queue = [] opener = URLOpener() parser = URLParser() db = BotDB(self.conf) parsed = [] queue += self.conf['initial']['sites'] print queue while len(queue) > 0: site = queue.pop(0) if site in parsed: continue parsed.append(site) self.logger.info("Parsing site: {0}".format(site)) self.logger.info("Len of queue: {0}".format(len(queue))) headers, data = opener.open(site) if 'Content-Type' in headers: if headers['Content-Type'].split(';')[0] == 'text/html': quad = parser.parse(site) doc = Document(quad[0], quad[1], quad[2], quad[3], headers, data) documents.append(doc) self._follow(doc, parser, queue, parsed, quad)
def main(): conf = get_config('../conf/config.yaml') documents = [] opener = URLOpener() parser = URLParser() sites = conf['initial']['sites'] for site in sites: headers, data = opener.open(site) if headers.getheader('Content-Type').split(';')[0] == 'text/html': typ = parser.parse(site) doc = Document(typ[0], typ[1], typ[2], typ[3], headers, data) documents.append(doc) print doc.get_text()
def setUp(self):
    """Give every test a freshly constructed URLParser instance."""
    self.parser = URLParser()