Example #1
def test_crawler_parse_hrefs(crawler):
    host_link = make_hyperlink("https://www.example.com")
    links = [
        "https://www.example.com#with-fragment",
        "https://www.example.com?with=query",
        "https://www.example.com/?with=query#with-fragment",
        "#with-fragment",
        "?with=query",
        "/?with=query#with-fragment",
        "/some/path",
        "/another/path",
        "https://www.example.com/",
        "https://www.example.com/",
        "https://www.example.com/third/path",
        "https://www.dont-find.com",
        "https://www.subdomain.example.com",
    ]
    input_hrefs = make_hyperlink_set([make_hyperlink(link) for link in links])
    assert crawler._parse_hrefs(input_hrefs, host_link) == make_hyperlink_set(
        [
            host_link,
            host_link + "/some/path",
            host_link + "/another/path",
            host_link + "/third/path",
        ]
    )
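
The test above pins down the contract of _parse_hrefs under the default trim_query/trim_fragment settings: relative links are joined onto the host, queries and fragments are trimmed, other domains and subdomains are dropped, and duplicates collapse. A minimal standalone sketch of that filtering, using urllib.parse rather than the project's Hyperlink helpers (an approximation, not the library's implementation):

from urllib.parse import urljoin, urlsplit, urlunsplit

def parse_hrefs_sketch(hrefs, host):
    # illustrative stand-in: the real _parse_hrefs works on Hyperlink objects
    host_netloc = urlsplit(host).netloc
    kept = set()
    for href in hrefs:
        absolute = urljoin(host, href)  # resolve relative links against the host
        scheme, netloc, path, _, _ = urlsplit(absolute)
        if netloc != host_netloc:  # drop other domains and subdomains
            continue
        kept.add(urlunsplit((scheme, netloc, path, "", "")))  # trim query and fragment
    return kept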
    def _render_results(self) -> Set[str]:
        """render all urls as a set of strings and reset crawler"""
        results = {str(url) for url in self._done_urls}
        # reset to start point
        self._queue = queue.Queue()
        self._seen_urls = make_hyperlink_set()
        self._done_urls = make_hyperlink_set()
        return results
    def __init__(
        self,
        user_agent: str = DEFAULT_USER_AGENT,
        session: Session = None,
        max_workers: int = 1,
        timeout: int = 10,
        obey_robots: bool = True,
        check_head: bool = False,
        trim_query: bool = True,
        trim_fragment: bool = True,
        recover_from_error: bool = False,
        db_config: Configuration = None,
        metadata_table_name: str = 'crawler_metadata'
    ):
        # config elements
        self.user_agent = user_agent
        self.max_workers = max_workers
        self.timeout = timeout
        self.obey_robots = obey_robots
        self.check_head = check_head
        self.trim_query = trim_query
        self.trim_fragment = trim_fragment

        # setup internal elements
        self._requester = Requester(user_agent=self.user_agent, session=session)
        self._queue = queue.Queue()
        self._seen_urls = make_hyperlink_set()
        self._done_urls = make_hyperlink_set()

        # todo elements: could allow recording of redirects, client errors & server errors
        self.record_redirects = False
        # self.record_client_errors = False
        # self.record_server_errors = False

        self.recover_from_error = recover_from_error
        self.recover_url = None
        self.recover_hrefs = None

        if db_config:
            self.db = MySqlDatastore(db_config.get_datastores()[0])
            self.metadata_table_name = metadata_table_name
            self.tag = db_config.tag
            if self.recover_from_error:
                self.recover_url, self.recover_hrefs = self.recover()
        else:
            self.db = None
            if self.recover_from_error:
                raise Exception("Can't recover from error without setting a DB!")
Example #4
def test_anchor_tag_parser_multiple_links_with_duplicates(links):
    html, hrefs = (
        make_html(make_a_tags(links)),
        {make_hyperlink(link) for link in links},
    )
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == hrefs
    assert parser.found_links == make_hyperlink_set(hrefs)
    def recover(self):
        """Reload the most recent crawl state for this tag from the metadata table."""
        retrieved_data = self.db.select_from_table(table=self.metadata_table_name,
                                                   columns='*',
                                                   where=f"crawler_tag='{self.tag}'",
                                                   order_by='id',
                                                   asc_or_desc='DESC')

        # the newest row whose status column reads 'Found' holds the crawl frontier
        last_found_entry = next(filter(lambda row: row[2] == 'Found', retrieved_data))
        url = make_hyperlink(last_found_entry[3])
        current_href = last_found_entry[4]
        hrefs = last_found_entry[5].split(',')
        # drop the hrefs that were already processed before the failure
        href_index = hrefs.index(current_href)
        hrefs = hrefs[href_index:]
        hrefs = [make_hyperlink(href) for href in hrefs]
        hrefs = make_hyperlink_set(hrefs)
        return url, hrefs
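
recover() reads the rows positionally, which implies a metadata table shaped roughly like (id, crawler_tag, status, url, current_href, hrefs), with hrefs stored as a comma-separated string; that column layout is inferred from the indices above, not confirmed by a schema. The slicing step on a hand-built row:

# hypothetical row following the inferred layout:
# (id, crawler_tag, status, url, current_href, hrefs_csv)
row = (7, "my-tag", "Found", "https://www.example.com/page", "/c", "/a,/b,/c,/d,/e")

hrefs = row[5].split(",")
remaining = hrefs[hrefs.index(row[4]):]  # resume from the href that was in flight
print(remaining)  # ['/c', '/d', '/e']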
Example #6
def test_hyperlink_set_behaves_like_set():
    links = {"/hello", "/world", "/?hello=world"}
    # check __init__
    hrefs = make_hyperlink_set(links)
    # check __len__
    assert len(hrefs) == 3
    # check add
    hrefs.add(make_hyperlink("/?hello=world&world=hello"))
    # check __len__ again
    assert len(hrefs) == 4
    # check __contains__
    for link in links:
        assert make_hyperlink(link) in hrefs
    # check __iter__
    found = set()
    for href in hrefs:
        found.add(href)
    assert found == hrefs.collection
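
For orientation, a bare-bones sketch of the set protocol this test exercises; the real HyperlinkSet also offers filter_by and join_all, and its internals here are an assumption:

class HyperlinkSetSketch:
    """Stand-in that wraps a plain set and forwards the dunder protocol."""

    def __init__(self, links=()):
        self.collection = set(links)

    def add(self, link):
        self.collection.add(link)

    def __len__(self):
        return len(self.collection)

    def __contains__(self, link):
        return link in self.collection

    def __iter__(self):
        return iter(self.collection)

    def __eq__(self, other):
        return isinstance(other, HyperlinkSetSketch) and self.collection == other.collection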
    def _get_hrefs(self, url: Hyperlink) -> HyperlinkSet:
        """get hrefs from url with requester"""
        resp = self._requester(
            url,
            check_head_first=self.check_head,
            follow_redirects=(not self.record_redirects),
        )

        # if we want to record redirects
        # and the response returns a redirect
        # then we will grab the "Location" header from the response
        # because there will be no links to scrape from the text
        if self.record_redirects and str(resp.status_code).startswith("3"):
            hrefs = make_hyperlink_set([make_hyperlink(resp.headers["Location"])])
        # else we scrape from the text
        else:
            hrefs = get_hrefs_from_html(resp.text)

        return hrefs
Example #8
def test_crawler_render_results(crawler):
    assert crawler._queue.empty()
    assert crawler._seen_urls == make_hyperlink_set()
    assert crawler._done_urls == make_hyperlink_set()

    crawler._queue.put("job")
    crawler._seen_urls = make_hyperlink_set(["/hello", "world"])
    crawler._done_urls = make_hyperlink_set(["/this", "/that"])

    results = crawler._render_results()
    assert results == {"/this", "/that"}
    assert crawler._queue.empty()
    assert crawler._seen_urls == make_hyperlink_set()
    assert crawler._done_urls == make_hyperlink_set()
Example #9
def test_crawler_crawl_url(crawler_server, crawler):
    crawler._crawl_url(crawler_server.href / "hello")
    assert crawler._queue.get() == crawler_server.href / "world"
    assert crawler._seen_urls == make_hyperlink_set([crawler_server.href / "world"])
    assert crawler._done_urls == make_hyperlink_set([crawler_server.href / "hello"])
Example #10
def test_crawler_get_hrefs(crawler_server, crawler, record_redirects, found_link):
    crawler.record_redirects = record_redirects
    found_link = found_link.format(host=crawler_server.url)
    assert crawler._get_hrefs(crawler_server.href + "/redirect/hello") == make_hyperlink_set(
        [found_link]
    )
Example #11
def test_anchor_tag_parser_single_link(link):
    html, href = make_html(make_a_tag(link)), make_hyperlink(link)
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == {href}
    assert parser.found_links == make_hyperlink_set([href])
Example #12
def test_hyperlink_set_filter_by_multi_kwargs(fields, input_links, output_links):
    input_hrefs = make_hyperlink_set(input_links)
    filtered_hrefs = input_hrefs.filter_by(**fields)
    output_hrefs = make_hyperlink_set(output_links)
    assert filtered_hrefs == output_hrefs
Example #13
def test_hyperlink_set_filter_by(fields, input_links, output_links):
    input_hrefs = make_hyperlink_set(input_links)
    k, v = fields
    filtered_hrefs = input_hrefs.filter_by(**{k: v})
    output_hrefs = make_hyperlink_set(output_links)
    assert filtered_hrefs == output_hrefs
Example #14
def test_hyperlink_set_absolute_links_join_all(input_links, output_links):
    links = make_hyperlink_set(input_links)
    domain = "https://www.google.com"
    assert links.join_all(domain) == make_hyperlink_set(output_links)
Example #15
def test_hyperlink_set_relative_links_join_all(input_links, output_links):
    links = make_hyperlink_set(input_links)
    domain = "https://www.google.com"
    assert links.join_all(domain) == make_hyperlink_set(
        [make_hyperlink(domain + link) for link in output_links]
    )
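
Read together, these two join_all tests say: absolute links pass through unchanged, relative links get the domain prefixed. urllib.parse.urljoin gives the same behaviour, so a standalone sketch (whether the real join_all delegates to it is an assumption):

from urllib.parse import urljoin

def join_all_sketch(links, domain):
    # urljoin leaves absolute URLs untouched and resolves relative ones against domain
    return {urljoin(domain, link) for link in links}

print(sorted(join_all_sketch({"/maps", "https://docs.python.org/3/"},
                             "https://www.google.com")))
# ['https://docs.python.org/3/', 'https://www.google.com/maps']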
Example #16
def crawler():
    crawler = Crawler(timeout=0)
    assert crawler._queue.empty()
    assert crawler._seen_urls == make_hyperlink_set()
    assert crawler._done_urls == make_hyperlink_set()
    return crawler
    def __init__(self):
        # init parent
        super().__init__()

        # create set of links found
        self.found_links = make_hyperlink_set()
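
This __init__ is only half of the parser; the rest is a handle_starttag override that collects href attributes from a tags, as the tests in Examples #4 and #11 imply. A hedged completion, assuming the standard html.parser.HTMLParser base:

from html.parser import HTMLParser

class AnchorTagParserSketch(HTMLParser):
    def __init__(self):
        super().__init__()
        self.found_links = make_hyperlink_set()

    def handle_starttag(self, tag, attrs):
        # only <a> tags carrying an href contribute links
        if tag != "a":
            return
        for name, value in attrs:
            if name == "href" and value is not None:
                self.found_links.add(make_hyperlink(value))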
Example #18
def test_get_hrefs_from_html_unique(input_links, output_results):
    html = make_html(make_a_tags(input_links))
    hrefs = {make_hyperlink(link) for link in output_results}
    assert get_hrefs_from_html(html).collection == hrefs
    assert get_hrefs_from_html(html) == make_hyperlink_set(hrefs)