def test_crawler_has_root_domain(allow_subd, subd_result):
    cr = crawler.Crawler("https://google.com", "", allow_subdomains=allow_subd)
    assert cr._has_root_url("https://google.com/search") == True
    assert cr._has_root_url("https://yandex.ru/search") == False
    assert cr._has_root_url("https://mail.google.com") == subd_result

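# NOTE: `allow_subd` and `subd_result` are pytest fixtures supplied outside this excerpt
# (typically from conftest.py). A minimal, hypothetical sketch of one way such a
# parametrized pairing could be wired -- the fixture names come from the tests, but the
# wiring and values below are assumptions for illustration only:
#
# import pytest
#
# @pytest.fixture(params=[(True, True), (False, False)],
#                 ids=["subdomains-allowed", "subdomains-forbidden"])
# def _subdomain_case(request):
#     return request.param
#
# @pytest.fixture
# def allow_subd(_subdomain_case):
#     return _subdomain_case[0]
#
# @pytest.fixture
# def subd_result(_subdomain_case):
#     return _subdomain_case[1]
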
def test_crawler_filter_links(checkup_html, allow_subd, result):
    cr = crawler.Crawler("https://google.com", "", allow_subdomains=allow_subd)
    filter_result = list(
        cr._filter_links(checkup_html["title_nonempty_links"][1], "https://google.com"))
    filter_result.sort()
    assert filter_result == checkup_html[result]

def test_crawler_normalize_link():
    cr = crawler.Crawler("https://google.com", "")
    assert (cr._normalize_link(
        "https://google.com/help", "https://google.com/") == "https://google.com/help")
    assert (cr._normalize_link(
        "/help", "https://google.com/") == "https://google.com/help")
    assert (cr._normalize_link(
        "https://mail.google.com/help#fragment",
        "https://google.com/") == "https://mail.google.com/help")

def test_crawler_instance():
    cr = crawler.Crawler(
        "https://google.com",
        splash_address="0.0.0.0:8050",
        allow_subdomains=True,
        allow_queries=True,
        depth_by_desc=3,
        concurrency=5,
        max_pause=10.0,
    )
    assert isinstance(cr, crawler.Crawler)

def test_crawler_is_valid_link(allow_subd, subd_result):
    cr = crawler.Crawler("https://google.com", "", allow_subdomains=allow_subd)
    assert (cr._is_valid_link("https://google.com/search", "https://google.com") == True)
    assert cr._is_valid_link("/help", "https://google.com") == True
    assert cr._is_valid_link("#help", "https://google.com") == False
    assert (cr._is_valid_link("https.google.com/#help", "https://google.com") == False)
    assert (cr._is_valid_link("https://yandex.ru/search", "https://google.com") == False)
    assert (cr._is_valid_link("https://mail.google.com", "https://google.com") == subd_result)

def test_remove_query():
    cr = crawler.Crawler("https://google.com", "")
    assert (cr._remove_query("https://google.com/search?page=42") ==
            "https://google.com/search")

def test_crawler_get_page_data(checkup_html):
    cr = crawler.Crawler("https://google.com", "")
    assert (cr.get_page_data(
        checkup_html["html"]) == checkup_html["title_nonempty_links"])

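# NOTE: `checkup_html` is a fixture (defined outside this excerpt, e.g. in conftest.py)
# bundling a sample HTML page with its expected parse results. Judging by how the tests
# index it, one hypothetical sketch of its shape -- all values here are invented for
# illustration, not the project's actual test data -- might be:
#
# @pytest.fixture
# def checkup_html():
#     html = (
#         '<html><head><title>Checkup</title></head>'
#         '<body><a href="/help">Help</a><a href="https://mail.google.com">Mail</a></body></html>'
#     )
#     return {
#         "html": html,
#         # (page title, non-empty links) as returned by Crawler.get_page_data()
#         "title_nonempty_links": ("Checkup", ["/help", "https://mail.google.com"]),
#         # expected _filter_links() output, keyed by the parametrized `result` fixture
#         "result_with_subdomains": [...],
#         "result_without_subdomains": [...],
#     }
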
def save(
    url,
    splash_address,
    subdomains,
    queries,
    depth,
    concurrency,
    max_pause,
    file_prefix,
):
    """Perform the crawling and, on success, save the crawled data as a JSON
    graph representation and a TAB-formatted text file.

    EXAMPLE: python baby_crawler save http://scrapethissite.com -s http://localhost:8050
    """
    # Perform crawling and measure time
    cr = crawler.Crawler(
        url,
        splash_address,
        subdomains,
        queries,
        depth,
        concurrency,
        max_pause,
    )
    start = time.perf_counter()
    cr.make_site_map()
    elapsed = time.perf_counter() - start

    # Count processed links and errors
    links_found = len(cr.added_tasks)
    links_crawled = len(cr.crawled_links) - 1
    echo(
        f"Found {links_found} unique links on {url} with depth level {depth}.",
        color="green",
    )
    echo(f"Successfully crawled {links_crawled} links.", color="green")
    echo(
        "Elapsed time {}".format(datetime.timedelta(seconds=elapsed)),
        color="green",
    )
    if cr.error_count:
        echo("Errors:", color="red")
        for error, amount in cr.error_count.items():
            echo(f"{error}: {amount}", color="red")

    # Write files in case of successful crawling
    if links_found:
        if not file_prefix:
            file_prefix = re.sub("[^a-zA-Z0-9]", "_", url)
            file_prefix += time.strftime("_%y-%m-%d_%H-%M-%S", time.localtime())
        with open(f"{file_prefix}.json", "w+", encoding="utf-8") as json_file:
            json.dump(nx.node_link_data(cr.site_graph), json_file, ensure_ascii=False)
        echo(f"Written graph data to {file_prefix}.json", color="green")
        with open(f"{file_prefix}.txt", "w+") as txt_file:
            txt_file.write(cr.site_graph.nodes[1]["url"] + "\n")

            # Recursively write each node's successors, indented one TAB per depth level
            def writegraph(graph, start_node, level):
                level += 1
                for i in graph.successors(start_node):
                    txt_file.write("\t" * level + graph.nodes[i]["url"] + "\n")
                    writegraph(graph, i, level)

            writegraph(cr.site_graph, 1, 0)
        echo(f"Written found links to {file_prefix}.txt", color="green")
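
# NOTE: `save` is driven from the command line (see the EXAMPLE in the docstring above);
# the actual argument/option declarations live outside this excerpt. A hypothetical
# click-based wiring consistent with "python baby_crawler save URL -s SPLASH" -- the
# option names, defaults, and the `cli` group below are assumptions, not the project's
# real interface:
#
# import click
#
# @click.group()
# def cli():
#     pass
#
# @cli.command()
# @click.argument("url")
# @click.option("-s", "--splash-address", default="", help="Address of a Splash instance.")
# @click.option("--subdomains/--no-subdomains", default=False)
# @click.option("--queries/--no-queries", default=False)
# @click.option("-d", "--depth", default=3, type=int)
# @click.option("-c", "--concurrency", default=5, type=int)
# @click.option("-p", "--max-pause", default=10.0, type=float)
# @click.option("-f", "--file-prefix", default="")
# def save(url, splash_address, subdomains, queries, depth, concurrency, max_pause, file_prefix):
#     ...  # body as defined above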