def test_crawler_parse_hrefs(crawler):
    host_link = make_hyperlink("https://www.example.com")
    links = [
        "https://www.example.com#with-fragment",
        "https://www.example.com?with=query",
        "https://www.example.com/?with=query#with-fragment",
        "#with-fragment",
        "?with=query",
        "/?with=query#with-fragment",
        "/some/path",
        "/another/path",
        "https://www.example.com/",
        "https://www.example.com/",
        "https://www.example.com/third/path",
        "https://www.dont-find.com",
        "https://www.subdomain.example.com",
    ]
    input_hrefs = make_hyperlink_set([make_hyperlink(link) for link in links])

    assert crawler._parse_hrefs(input_hrefs, host_link) == make_hyperlink_set(
        [
            host_link,
            host_link + "/some/path",
            host_link + "/another/path",
            host_link + "/third/path",
        ]
    )
def _render_results(self) -> Set[str]:
    """render all urls as a set of strings and reset crawler"""
    results = {str(url) for url in self._done_urls}

    # reset to start point
    self._queue = queue.Queue()
    self._seen_urls = make_hyperlink_set()
    self._done_urls = make_hyperlink_set()

    return results
def __init__(
    self,
    user_agent: str = DEFAULT_USER_AGENT,
    session: Session = None,
    max_workers: int = 1,
    timeout: int = 10,
    obey_robots: bool = True,
    check_head: bool = False,
    trim_query: bool = True,
    trim_fragment: bool = True,
    recover_from_error: bool = False,
    db_config: Configuration = None,
    metadata_table_name: str = "crawler_metadata",
):
    # config elements
    self.user_agent = user_agent
    self.max_workers = max_workers
    self.timeout = timeout
    self.obey_robots = obey_robots
    self.check_head = check_head
    self.trim_query = trim_query
    self.trim_fragment = trim_fragment

    # setup internal elements
    self._requester = Requester(user_agent=self.user_agent, session=session)
    self._queue = queue.Queue()
    self._seen_urls = make_hyperlink_set()
    self._done_urls = make_hyperlink_set()

    # todo elements: could allow recording of redirects, client errors & server errors
    self.record_redirects = False
    # self.record_client_errors = False
    # self.record_server_errors = False

    self.recover_from_error = recover_from_error
    self.recover_url = None
    self.recover_hrefs = None
    if db_config:
        self.db = MySqlDatastore(db_config.get_datastores()[0])
        self.metadata_table_name = metadata_table_name
        self.tag = db_config.tag
        if self.recover_from_error:
            self.recover_url, self.recover_hrefs = self.recover()
    else:
        self.db = None
        if self.recover_from_error:
            raise Exception("Can't recover from error without setting a DB!")
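# Usage sketch for the constructor above (hedged: `config` stands in for a
# Configuration object built elsewhere in the codebase; its construction is
# not shown here):
#
#     crawler = Crawler(max_workers=4, timeout=5)    # in-memory only, no DB
#     crawler = Crawler(db_config=config)            # records metadata to MySQL
#     crawler = Crawler(
#         db_config=config, recover_from_error=True
#     )                                              # resumes via recover()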
def test_anchor_tag_parser_multiple_links_with_duplicates(links):
    html, hrefs = (
        make_html(make_a_tags(links)),
        {make_hyperlink(link) for link in links},
    )
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == hrefs
    assert parser.found_links == make_hyperlink_set(hrefs)
def recover(self):
    """recover crawl state from the most recent 'Found' metadata row for this crawler tag"""
    retrieved_data = self.db.select_from_table(
        table=self.metadata_table_name,
        columns="*",
        where=f"crawler_tag='{self.tag}'",
        order_by="id",
        asc_or_desc="DESC",
    )
    # most recent row (rows are ordered by id DESC) whose status column reads 'Found'
    last_found_entry = next(filter(lambda row: row[2] == "Found", retrieved_data))
    url = make_hyperlink(last_found_entry[3])
    current_href = last_found_entry[4]
    # resume from the href that was being processed, dropping those already done
    hrefs = last_found_entry[5].split(",")
    href_index = hrefs.index(current_href)
    hrefs = hrefs[href_index:]
    hrefs = [make_hyperlink(href) for href in hrefs]
    hrefs = make_hyperlink_set(hrefs)
    return url, hrefs
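# Note on the row layout recover() assumes: rows come back positionally, with
# row[2] the status, row[3] the URL being crawled, row[4] the href currently
# being processed, and row[5] a comma-joined list of all hrefs for that URL.
# An illustrative row (column names and values are hypothetical):
#
#     (17, "my-tag", "Found", "https://example.com/a", "/b", "/a,/b,/c")
#
# Recovery would resume at "https://example.com/a" with hrefs {"/b", "/c"},
# since everything before the current href is treated as already done.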
def test_hyperlink_set_behaves_like_set():
    links = {"/hello", "/world", "/?hello=world"}

    # check __init__
    hrefs = make_hyperlink_set(links)

    # check __len__
    assert len(hrefs) == 3

    # check add
    hrefs.add(make_hyperlink("/?hello=world&world=hello"))

    # check __len__ again
    assert len(hrefs) == 4

    # check __contains__
    for link in links:
        assert make_hyperlink(link) in hrefs

    # check __iter__
    found = set()
    for href in hrefs:
        found.add(href)
    assert found == hrefs.collection
def _get_hrefs(self, url: Hyperlink) -> HyperlinkSet:
    """get hrefs from url with requester"""
    resp = self._requester(
        url,
        check_head_first=self.check_head,
        follow_redirects=(not self.record_redirects),
    )
    # if we want to record redirects
    # and the response returns a redirect
    # then we grab the "Location" header from the response
    # because there will be no links to scrape from the text
    if self.record_redirects and str(resp.status_code).startswith("3"):
        hrefs = make_hyperlink_set([make_hyperlink(resp.headers["Location"])])
    # else we scrape from the text
    else:
        hrefs = get_hrefs_from_html(resp.text)
    return hrefs
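# Sketch of the two branches above with illustrative values: with
# record_redirects enabled, a 301 response carrying "Location: /new" yields
# make_hyperlink_set([make_hyperlink("/new")]); otherwise the response body is
# parsed for anchor tags via get_hrefs_from_html(resp.text).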
def test_crawler_render_results(crawler):
    assert crawler._queue.empty()
    assert crawler._seen_urls == make_hyperlink_set()
    assert crawler._done_urls == make_hyperlink_set()

    crawler._queue.put("job")
    crawler._seen_urls = make_hyperlink_set(["/hello", "world"])
    crawler._done_urls = make_hyperlink_set(["/this", "/that"])

    results = crawler._render_results()
    assert results == {"/this", "/that"}

    assert crawler._queue.empty()
    assert crawler._seen_urls == make_hyperlink_set()
    assert crawler._done_urls == make_hyperlink_set()
def test_crawler_crawl_url(crawler_server, crawler):
    crawler._crawl_url(crawler_server.href / "hello")
    assert crawler._queue.get() == crawler_server.href / "world"
    assert crawler._seen_urls == make_hyperlink_set([crawler_server.href / "world"])
    assert crawler._done_urls == make_hyperlink_set([crawler_server.href / "hello"])
def test_crawler_get_hrefs(crawler_server, crawler, record_redirects, found_link):
    crawler.record_redirects = record_redirects
    found_link = found_link.format(host=crawler_server.url)
    assert crawler._get_hrefs(crawler_server.href + "/redirect/hello") == make_hyperlink_set(
        [found_link]
    )
def test_anchor_tag_parser_single_link(link):
    html, href = make_html(make_a_tag(link)), make_hyperlink(link)
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == {href}
    assert parser.found_links == make_hyperlink_set([href])
def test_hyperlink_set_filter_by_multi_kwargs(fields, input_links, output_links):
    input_hrefs = make_hyperlink_set(input_links)
    filtered_hrefs = input_hrefs.filter_by(**fields)
    output_hrefs = make_hyperlink_set(output_links)
    assert filtered_hrefs == output_hrefs
def test_hyperlink_set_filter_by(fields, input_links, output_links):
    input_hrefs = make_hyperlink_set(input_links)
    k, v = fields
    filtered_hrefs = input_hrefs.filter_by(**{k: v})
    output_hrefs = make_hyperlink_set(output_links)
    assert filtered_hrefs == output_hrefs
def test_hyperlink_set_absolute_links_join_all(input_links, output_links):
    links = make_hyperlink_set(input_links)
    domain = "https://www.google.com"
    assert links.join_all(domain) == make_hyperlink_set(output_links)
def test_hyperlink_set_relative_links_join_all(input_links, output_links):
    links = make_hyperlink_set(input_links)
    domain = "https://www.google.com"
    assert links.join_all(domain) == make_hyperlink_set(
        [make_hyperlink(domain + link) for link in output_links]
    )
def crawler():
    crawler = Crawler(timeout=0)
    assert crawler._queue.empty()
    assert crawler._seen_urls == make_hyperlink_set()
    assert crawler._done_urls == make_hyperlink_set()
    return crawler
def __init__(self):
    # init parent
    super().__init__()
    # create set of links found
    self.found_links = make_hyperlink_set()
def test_get_hrefs_from_html_unique(input_links, output_results):
    html = make_html(make_a_tags(input_links))
    hrefs = {make_hyperlink(link) for link in output_results}
    assert get_hrefs_from_html(html).collection == hrefs
    assert get_hrefs_from_html(html) == make_hyperlink_set(hrefs)