def test_create_links(self):
    """Check that create_links returns one link and leaves two pages stored.

    Starts from an empty Page table, links ``link1`` to ``link2`` and then
    verifies the returned link's endpoints and the resulting page count.
    """
    # no pages should exist
    self.assertEqual(Page.select().count(), 0)

    # create links
    links = create_links(link1, [link2])

    # assertEqual rather than assertTrue: assertTrue(len(links), 1) treats
    # the 1 as the failure message and passes for any non-empty result.
    self.assertEqual(len(links), 1)

    link = links[0]
    self.assertEqual(link.from_page.url, link1)
    self.assertEqual(link.to_page.url, link2)

    # 2 pages should exist
    self.assertEqual(Page.select().count(), 2)
def test_page_twice(self):
    """Creating a second Page with an already-used url raises IntegrityError."""
    # make sure database is cleared out between tests
    page_fields = {
        'url': 'http://www.example.com/foo',
        'content': 'hi world',
        'status_code': 200,
    }

    Page.create(**page_fields)
    self.assertEqual(Page.select().count(), 1)

    # The second insert reuses exactly the same field values.
    with self.assertRaises(IntegrityError):
        Page.create(**page_fields)
def test_permanent_redirect(self, requests_get, requests_head):
    """A 301 HEAD response records the redirect target as a page and a link.

    ``requests_get`` and ``requests_head`` are mock objects supplied to the
    test; only the HEAD mock is expected to be called.
    """
    source_url = "http://www.example.com/foo"
    target_url = "http://www.example.com/bar"

    # Mocked HEAD response: status 301 with a location header.
    head_response = MagicMock(status_code=301, headers={'location': target_url})
    requests_head.return_value = head_response

    page = Page.create(url=source_url, content='', status_code=0)
    add_page_info_to_page(page)

    # A page for the redirect target must now exist ...
    target_query = Page.select().where(Page.url == target_url)
    to_page = target_query.first()
    self.assertTrue(to_page)

    # ... together with a link from the original page to it.
    url_redirect_link = Link.select().where(
        Link.from_page == page,
        Link.to_page == to_page,
    )
    self.assertTrue(url_redirect_link.exists())

    # Exactly one HEAD request and no GET request were made.
    self.assertEqual(requests_head.call_count, 1)
    self.assertFalse(requests_get.called)

    # The page content is expected to equal the redirect target.
    self.assertEqual(page.content, target_url)
def test_permanent_redirect(self, requests_get, requests_head):
    """Verify handling of a mocked 301 response in add_page_info_to_page.

    Asserts that a page for the redirect location exists afterwards, that a
    link connects the original page to it, that only one HEAD request (and
    no GET request) was issued, and that the page content equals the
    redirect location.
    """
    origin = "http://www.example.com/foo"
    destination = "http://www.example.com/bar"

    mocked_head = MagicMock(
        status_code=301,
        headers={'location': destination},
    )
    requests_head.return_value = mocked_head

    page = Page.create(url=origin, content='', status_code=0)
    add_page_info_to_page(page)

    to_page = Page.select().where(Page.url == destination).first()
    self.assertTrue(to_page)

    link_matches = (Link.from_page == page, Link.to_page == to_page)
    url_redirect_link = Link.select().where(*link_matches)
    self.assertTrue(url_redirect_link.exists())

    self.assertEqual(requests_head.call_count, 1)
    self.assertFalse(requests_get.called)
    self.assertEqual(page.content, destination)
def sizeof_fmt(num, suffix='B'):
    """
    Return a size formatted with binary (1024-based) unit prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 or
    the 'Yi' prefix is reached, then rendered with one decimal place.

    http://stackoverflow.com/a/1094933
    """
    binary_prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    for prefix in binary_prefixes:
        if not abs(num) < 1024.0:
            # Still too large for this prefix; scale down and try the next.
            num /= 1024.0
            continue
        return "%3.1f%s%s" % (num, prefix, suffix)
    # Anything beyond 'Zi' is reported in 'Yi', however large.
    return "%.1f%s%s" % (num, 'Yi', suffix)


if __name__ == "__main__":
    initialize('corpus.db')

    # Predicates for the "crawled" count: status 200 with an HTML or
    # plain-text content type.
    status_ok = (Page.status_code == 200)
    textual_content = (
        (Page.content_type == 'text/html') |
        (Page.content_type == 'text/plain')
    )

    page_count = Page.select().count()
    crawled_count = Page.select().where(status_ok & textual_content).count()
    redirect_count = Page.select().where(Page.status_code == 301).count()
    to_crawl_count = Page.select().where(Page.status_code == 0).count()

    # Whatever is not crawled, redirected or still pending.
    other_count = (
        page_count - crawled_count - redirect_count - to_crawl_count
    )
    link_count = Link.select().count()

    # Size of the database file on disk, in human-readable form.
    corpus_size = sizeof_fmt(os.stat('corpus.db').st_size)

    print('crawled pages: {}'.format(crawled_count))
def sizeof_fmt(num, suffix='B'):
    """
    Format a size as a string with a binary unit prefix and one decimal.

    Walks the prefixes from none up to 'Zi', dividing by 1024 at each step,
    and falls back to 'Yi' when the value is still 1024 or more.

    http://stackoverflow.com/a/1094933
    """
    units = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']
    position = 0
    while position < len(units):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, units[position], suffix)
        num /= 1024.0
        position += 1
    return "%.1f%s%s" % (num, 'Yi', suffix)


if __name__ == "__main__":
    initialize('corpus.db')

    all_pages = Page.select()
    page_count = all_pages.count()

    # Crawled: status 200 and content type text/html or text/plain.
    html_or_plain = (
        (Page.content_type == 'text/html') |
        (Page.content_type == 'text/plain')
    )
    crawled_pages = Page.select().where(
        (Page.status_code == 200) & html_or_plain)
    crawled_count = crawled_pages.count()

    redirect_pages = Page.select().where(Page.status_code == 301)
    redirect_count = redirect_pages.count()

    pending_pages = Page.select().where(Page.status_code == 0)
    to_crawl_count = pending_pages.count()

    other_count = page_count - crawled_count - redirect_count - to_crawl_count
    link_count = Link.select().count()

    # Raw byte size first, then replaced by its formatted representation.
    corpus_size = os.stat('corpus.db').st_size
    corpus_size = sizeof_fmt(corpus_size)

    print('crawled pages: {}'.format(crawled_count))