class SpiderTest(unittest.TestCase):
    """Unit tests for the helper methods exposed by ``Spider``.

    Each test gets a fresh ``Spider`` built in ``setUp``; the assertions
    below document the behaviour the tests expect from each helper.
    """

    def setUp(self):
        """Create the spider instance shared by the test methods."""
        self.spider = Spider('http://www.google.com', loglevel=1)

    def tearDown(self):
        """Drop every handler attached to the spider's logger.

        NOTE(review): presumably this keeps handlers from accumulating
        across tests -- confirm against ``Spider``'s logger setup.
        """
        self.spider.logger.handlers = []

    # ------------------------------------------------------------------
    # is_valid_url
    # ------------------------------------------------------------------
    def test_is_valid_url_with_valid_url(self):
        """URLs with an http/https scheme and a host are accepted."""
        for url in ('http://www.google.com', 'https://www.google.com'):
            self.assertTrue(self.spider.is_valid_url(url))

    def test_is_valid_url_with_invalid_url(self):
        """Strings without a scheme are rejected."""
        for url in ('www.google.com', 'google'):
            self.assertFalse(self.spider.is_valid_url(url))

    # ------------------------------------------------------------------
    # get_abs_url
    # ------------------------------------------------------------------
    def test_get_abs_url_with_valid_base(self):
        """Check the absolute URL produced for several base/url pairs."""
        cases = [
            # (base, url, expected)
            ('http://www.google.com/a/', 'b', 'http://www.google.com/a/b'),
            ('http://www.google.com/a', 'b', 'http://www.google.com/b'),
            ('https://www.google.com/a/c', '/b', 'https://www.google.com/b'),
            ('http://www.google.com', 'http://www.facebook.com',
             'http://www.facebook.com'),
            (None, 'http://www.google.com', 'http://www.google.com'),
        ]
        for base, url, expected in cases:
            self.assertEqual(self.spider.get_abs_url(base, url), expected)

    def test_get_abs_url_with_invalid_base(self):
        """A bare ``'http://'`` base still yields a full URL."""
        result = self.spider.get_abs_url('http://', 'www.google.com')
        self.assertEqual(result, 'http://www.google.com')

    def test_get_abs_url_with_invalid_url(self):
        """With no base, a bare ``'http://'`` url comes back unchanged."""
        result = self.spider.get_abs_url(None, 'http://')
        self.assertEqual(result, 'http://')

    # ------------------------------------------------------------------
    # get_all_links
    # ------------------------------------------------------------------
    def test_get_all_links_with_valid_content(self):
        """Every ``href`` value is returned verbatim, in document order."""
        content = ''.join((
            '<html><body>',
            '<a href="http://a/b/c"></a>',
            '<a href="a/b/c"></a>',
            '<a href="/a/b/c"></a>',
            '<a href="a"></a>',
            '<a href="#"></a>',
            '<a href="javascript"></a>',
            '</body></html>',
        ))
        expected = ['http://a/b/c', 'a/b/c', '/a/b/c', 'a', '#', 'javascript']
        self.assertEqual(expected, self.spider.get_all_links(content))

    # ------------------------------------------------------------------
    # convert_to_unicode
    # ------------------------------------------------------------------
    def test_convert_to_unicode_gbk(self):
        """GBK-encoded bytes decode back to the original text."""
        text = u'测试'
        self.assertEqual(
            self.spider.convert_to_unicode(text.encode('gbk')), text)

    def test_convert_to_unicode_gb2312(self):
        """GB2312-encoded bytes decode back to the original text."""
        text = u'测试'
        self.assertEqual(
            self.spider.convert_to_unicode(text.encode('gb2312')), text)

    def test_convert_to_unicode_utf8(self):
        """UTF-8-encoded bytes decode back to the original text."""
        text = u'测试'
        self.assertEqual(
            self.spider.convert_to_unicode(text.encode('utf-8')), text)

    def test_convert_to_unicode_unknown(self):
        """UTF-16 input is expected to make the conversion raise."""
        encoded = u'测试'.encode('utf-16')
        self.assertRaises(
            Exception, self.spider.convert_to_unicode, encoded)

    # ------------------------------------------------------------------
    # filter_links
    # ------------------------------------------------------------------
    def test_filter_links_with_fragment(self):
        """Fragments (everything from the first ``#``) are stripped."""
        filtered = self.spider.filter_links(
            ['http://a#one', 'https://b/c/d#one#two'])
        # Order of the result is not part of the contract being tested.
        self.assertEqual(sorted(filtered), sorted(['http://a', 'https://b/c/d']))

    def test_filter_links_with_relative_url(self):
        """A scheme-less link comes back as an ``http://`` URL."""
        filtered = self.spider.filter_links(['www.google.com'])
        self.assertEqual(filtered, ['http://www.google.com'])

    def test_filter_links_with_duplicated_url(self):
        """Duplicate links are collapsed into one entry."""
        filtered = self.spider.filter_links(
            ['http://www.google.com', 'http://www.google.com'])
        self.assertEqual(filtered, ['http://www.google.com'])

    def test_filter_links_with_invalid_url(self):
        """Scheme-only links are dropped entirely."""
        filtered = self.spider.filter_links(['http://', 'https://'])
        self.assertEqual(filtered, [])

    # ------------------------------------------------------------------
    # get_logger
    # ------------------------------------------------------------------
    def test_get_logger(self):
        """A debug message sent to the returned logger lands in the file."""
        # Local import: the module's import block is outside this class.
        import tempfile

        # A private directory avoids clashes with stale or concurrently
        # written log files, and works on platforms without /tmp.
        log_dir = tempfile.mkdtemp()
        log_path = os.path.join(log_dir, 'test.log')
        try:
            logger = self.spider.get_logger(log_path, 5)
            logger.debug('logger test')
            with open(log_path) as log_file:
                # "At least one matching line": toggling a flag per match
                # would report failure for an even number of matches.
                log_msg_exist = any(
                    line.strip().endswith('logger test')
                    for line in log_file)
        finally:
            # Clean up even when the logger call or the read fails.
            if os.path.exists(log_path):
                os.remove(log_path)
            os.rmdir(log_dir)
        self.assertTrue(log_msg_exist)

    # ------------------------------------------------------------------
    # verify_page_headers
    # ------------------------------------------------------------------
    def test_verify_page_headers(self):
        """ETag / Last-Modified are exposed under lower-case keys."""
        headers = {
            'ETag': '123',
            'Last-Modified': '456',
            'Content-Type': 'text/html',
        }
        verified = self.spider.verify_page_headers(headers)
        self.assertEqual(verified.get('etag', ''), '123')
        self.assertEqual(verified.get('lastmodified', ''), '456')

    def test_verify_page_headers_with_invalid_input(self):
        """A ``msdoc`` content type is expected to raise."""
        headers = {'content-type': 'msdoc'}
        self.assertRaises(
            Exception, self.spider.verify_page_headers, headers)

    # ------------------------------------------------------------------
    # get_key_pattern
    # ------------------------------------------------------------------
    def test_get_key_pattern_with_invalid_input(self):
        """UTF-16 bytes and ``None`` both yield no pattern."""
        self.assertIsNone(
            self.spider.get_key_pattern(u'测试'.encode('utf-16')))
        self.assertIsNone(self.spider.get_key_pattern(None))

    def test_get_pattern(self):
        """A pattern built from a GBK key matches text containing it."""
        pattern = self.spider.get_key_pattern(u'测试'.encode('gbk'))
        self.assertIsNotNone(pattern.search(u'我是测试'))