def test_fill_disallow_urls_from_robot(self):
    with patch.object(requests, 'get') as mock_get:
        with open('fake_robots.txt', 'r') as fake_robots_txt:
            mock_get.return_value = FakeResponse()
            mock_get.return_value.text = fake_robots_txt.read()
            test_crawler = Crawler('https://a/', [''], {})
            test_crawler.fill_disallow_urls(URL('https://a/'))
            test_crawler.close()
            self.assertEqual(
                {re.compile('https://a/b.+', re.IGNORECASE)},
                test_crawler.disallow_urls)
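# The test above relies on two fixtures defined elsewhere in the repository:
# fake_robots.txt and FakeResponse. A minimal sketch of what they are assumed
# to look like (not the repository's actual definitions): fake_robots.txt
# would contain a rule such as "Disallow: /b" for the tested host, and
# FakeResponse only needs the attributes the mocked requests.get caller reads.
class FakeResponse:
    """Hypothetical stand-in for requests.Response used by the tests."""

    def __init__(self, text='', status_code=200):
        self.text = text              # overwritten by the test with robots.txt content
        self.status_code = status_code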
def test_searcher_with_seen_urls(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {}, 2)
            test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
            test_result = test_crawler.crawl()
            test_crawler.close()
            assert 'http://scala-lang.org' not in test_result
def test_crawler_zero_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['dog'], {}, 2)
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(test_result, set())
def test_update_parents(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://a/c/></a>' \
                                     '<a href=http://a/b/></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler('http://a', [''], {}, max_urls_count=3)
            test_result = test_crawler.crawl()
            test_crawler.close()
            for page in test_result:
                if page.parent:
                    self.assertEqual(page.parent, Page(URL('http://a')))
def crawler_sqlmap(entry_url, depth=-1, level=1, threads=2, timeout=30,
                   checkhost=True):
    """Entry point for launching a sqlmap scan.

    :param entry_url: entry URL of the site to scan
    :param depth: crawl depth for the web crawler; -1 means no depth limit
        (default -1)
    :param level: sqlmap test level, 1-5 (default 1); higher levels run more
        test payloads, giving more accurate results but taking longer
    :param threads: number of sqlmap scanning threads (default 2)
    :param timeout: sqlmap scan timeout in seconds (default 30)
    :param checkhost: whether to check that crawled links belong to the same
        domain
    :return: a 4-tuple (ret, url, simple, content)
        ret: True on success, False on failure
        url: the scan target URL
        simple: summary report (dict) built by extracting the key data from
            content
        content: the full report returned by sqlmap (dict)
        If ret is False, the error message is stored in the simple slot.
    """
    settings = Setting(handle=False)
    settings.depth = depth
    settings.nocheckhost = not checkhost
    settings.level = level
    settings.threads = threads
    settings.timeout = timeout
    sqlmap, crawler = None, None
    try:
        sqlmap, ip, port = start_sqlmap()
        # The crawler must be created after sqlmap has started so that it can
        # pick up the correct sqlmap port.
        crawler = Crawler(BASE_DIR, ip, port, entry_url, setting=settings)
        crawler.run()
        cont, simple = crawler.raw_report()
        return True, entry_url, simple, cont
    except Exception:
        logger.error(traceback.format_exc())
        return False, entry_url, traceback.format_exc(), {}
    finally:
        if crawler:
            crawler.close()
        if sqlmap:
            sqlmap.terminate()
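# Example usage of crawler_sqlmap -- a sketch only: the target URL is a
# hypothetical placeholder and the parameter values are illustrative, not
# recommendations from the source.
ok, url, simple, content = crawler_sqlmap(
    'http://target.example/index.php',  # assumed entry URL
    depth=2,         # crawl at most two levels of links
    level=2,         # slightly more thorough sqlmap test level
    threads=4,
    timeout=60,
    checkhost=True)  # only follow links on the same domain
if ok:
    print('scan finished for', url, '- summary:', simple)
else:
    # on failure the traceback text is returned in the "simple" slot
    print('scan failed for', url, ':', simple)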
def test_searcher_with_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>' \
                                     '<a href=https://scala2.html></a>' \
                                     '<a href=https://scala3.html></a>' \
                                     '<a href=https://scala4.html></a>' \
                                     '<a href=https://scala5.html></a>' \
                                     '<a href=https://scala6.html></a>' \
                                     '<a href=https://scala7.html></a>' \
                                     '<a href=https://scala8.html></a>' \
                                     '<a href=https://scala9.html></a>' \
                                     '<a href=https://scala10.html></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {})
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(len(test_result), 10)
    default='pages',
    help='Directory for downloaded pages')
arguments_parser.add_argument('-g', action='store_true', help='Show graph')
arguments_parser.add_argument('-w', action='store_true',
                              help='Save found pages')
args = arguments_parser.parse_args()

white_domains = []
for domain in args.wildcard:
    if domain.startswith('*'):
        white_domains.append(re.compile(fr'[^.]+.{domain[1:]}'))
    else:
        white_domains.append(domain)

if args.start_url[-1] == '/':
    url = args.start_url[:-1]
else:
    url = args.start_url

crawler = Crawler(url, args.request, white_domains, args.d, args.f, args.w)
try:
    result = crawler.crawl()
    if args.g:
        show_graph(result)
    for link in result:
        print(link)
    print('Program is completed')
    plt.show()
except KeyboardInterrupt:
    print('Program is completed')
finally:
    crawler.close()
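# An assumed command-line invocation of the script above (the script name and
# the exact names of the positional arguments and of the truncated -d/-f
# options are guesses based on how args is used, not confirmed by the source):
#   python main.py https://docs.scala-lang.org/ scala -d 2 -f pages -g -w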