Example no. 1
def test_crawler_parse_hrefs(crawler):
    host_link = make_hyperlink("https://www.example.com")
    links = [
        "https://www.example.com#with-fragment",
        "https://www.example.com?with=query",
        "https://www.example.com/?with=query#with-fragment",
        "#with-fragment",
        "?with=query",
        "/?with=query#with-fragment",
        "/some/path",
        "/another/path",
        "https://www.example.com/",
        "https://www.example.com/",
        "https://www.example.com/third/path",
        "https://www.dont-find.com",
        "https://www.subdomain.example.com",
    ]
    input_hrefs = make_hyperlink_set([make_hyperlink(link) for link in links])
    assert crawler._parse_hrefs(input_hrefs, host_link) == make_hyperlink_set(
        [
            host_link,
            host_link + "/some/path",
            host_link + "/another/path",
            host_link + "/third/path",
        ]
    )
    def recover(self):
        """Recover the crawl state for this crawler's tag from the metadata table."""
        # fetch every metadata row for this crawler tag, newest first
        retrieved_data = self.db.select_from_table(table=self.metadata_table_name,
                                                   columns='*',
                                                   where=f"crawler_tag='{self.tag}'",
                                                   order_by='id',
                                                   asc_or_desc='DESC')

        # most recent row whose status column (index 2) is 'Found'
        # (raises StopIteration if no such row exists)
        last_found_entry = next(filter(lambda row: row[2] == 'Found', retrieved_data))
        # column 3 holds the page url, column 4 the href that was being processed,
        # and column 5 a comma-separated list of all hrefs found on that page
        url = make_hyperlink(last_found_entry[3])
        current_href = last_found_entry[4]
        hrefs = last_found_entry[5].split(',')
        # resume from the href that was in progress, dropping those already handled
        href_index = hrefs.index(current_href)
        hrefs = make_hyperlink_set([make_hyperlink(href) for href in hrefs[href_index:]])
        return url, hrefs
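The recover flow above returns the last page that was being processed together with the hrefs that still needed visiting. A minimal resume sketch follows; the Crawler constructor argument and the re-queueing step are assumptions for illustration, not part of the code shown in these examples.

# hypothetical resume flow, assuming recover() lives on the crawler class shown below
crawler = Crawler(tag="my-crawl")              # constructor args are an assumption
url, hrefs = crawler.recover()                 # last 'Found' url plus remaining hrefs
for href in hrefs:
    crawler._queue.put(href.join(str(url)))    # join() is shown in Example no. 11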
Example no. 3
def test_anchor_tag_parser_multiple_links_with_duplicates(links):
    html, hrefs = (
        make_html(make_a_tags(links)),
        {make_hyperlink(link) for link in links},
    )
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == hrefs
    assert parser.found_links == make_hyperlink_set(hrefs)
Example no. 4
def test_hyperlink_set_behaves_like_set():
    links = {"/hello", "/world", "/?hello=world"}
    # check __init__
    hrefs = make_hyperlink_set(links)
    # check __len__
    assert len(hrefs) == 3
    # check add
    hrefs.add(make_hyperlink("/?hello=world&world=hello"))
    # check __len__ again
    assert len(hrefs) == 4
    # check __contains__
    for link in links:
        assert make_hyperlink(link) in hrefs
    # check __iter__
    found = set()
    for href in hrefs:
        found.add(href)
    assert found == hrefs.collection
    def handle_starttag(self, tag: str, attrs: list) -> None:
        # https://docs.python.org/3/library/html.parser.html#html.parser.HTMLParser.handle_starttag
        # HTMLParser manages lowercase for us

        # grab only a tags
        if tag == "a":
            for attr, value in attrs:
                # grab only hrefs
                if attr == "href":
                    href = make_hyperlink(value)
                    self.found_links.add(href)
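For reference, the get_hrefs_from_html helper exercised in the other examples can be pictured as a thin wrapper around this parser. The sketch below is a plausible reconstruction, not necessarily the library's actual implementation.

# plausible sketch of the helper used elsewhere in these examples
def get_hrefs_from_html(html: str) -> HyperlinkSet:
    parser = AnchorTagParser()
    parser.feed(html)           # handle_starttag above collects every <a href="..."> it sees
    return parser.found_links   # a HyperlinkSet of the hrefs that were found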
    def crawl(self, domain: str) -> Set[str]:
        """crawl any site for all urls"""
        domain = make_hyperlink(domain)
        self._queue.put(domain)

        # get robots
        # todo: only do this if we obey robots?
        robots = self._get_robots(domain)

        with self._executor() as executor:
            while True:
                # exit if we have crawled all urls found
                if self._seen_urls == self._done_urls and self._seen_urls.is_not_empty():
                    # return results
                    return self._render_results()

                # wait for more urls to enter queue or return if we timeout
                try:
                    url = self._queue.get(timeout=self.timeout)
                except queue.Empty:
                    # return results
                    return self._render_results()

                # if the url has already been processed, skip it and continue the loop
                if url in self._done_urls:
                    continue

                # if we are to obey the robots then we need to see what we can scrape
                if self.obey_robots:
                    # start again if we can't fetch a url
                    if not robots.can_fetch(self.user_agent, str(url)):
                        print(f"{self.user_agent} can't crawl {url}")
                        continue

                    # there is a bug in py3.6 (https://bugs.python.org/issue35922);
                    # this try/except keeps the code working on 3.6
                    try:
                        # wait for delay if we can scrape but must crawl slowly
                        if robots.crawl_delay(self.user_agent):
                            delay = int(robots.crawl_delay(self.user_agent))
                            print(f"{self.user_agent} has a delay of {delay}, waiting...")
                            time.sleep(delay)
                    except AttributeError:
                        pass

                # submit crawl_url to executor
                executor.submit(self._crawl_url, url)
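A minimal usage sketch for the crawl method follows. The constructor keyword arguments mirror attributes referenced inside crawl() (user_agent, obey_robots, timeout), but their exact names and defaults are assumptions.

# hypothetical usage; constructor arguments are assumptions
crawler = Crawler(user_agent="my-bot", obey_robots=True, timeout=10)
results = crawler.crawl("https://www.example.com")   # returns the set of urls found
for url in sorted(results):
    print(url)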
    def _get_hrefs(self, url: Hyperlink) -> HyperlinkSet:
        """get hrefs from url with requester"""
        resp = self._requester(
            url,
            check_head_first=self.check_head,
            follow_redirects=(not self.record_redirects),
        )

        # if we want to record redirects
        # and the response returns a redirect
        # then we will grab the "Location" header from the response
        # because there will be no links to scrape from the text
        if self.record_redirects and str(resp.status_code).startswith("3"):
            hrefs = make_hyperlink_set([make_hyperlink(resp.headers["Location"])])
        # else we scrape from the text
        else:
            hrefs = get_hrefs_from_html(resp.text)

        return hrefs
    def href(self):
        return make_hyperlink(self.url)
Example no. 9
def test_anchor_tag_parser_single_link(link):
    html, href = make_html(make_a_tag(link)), make_hyperlink(link)
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == {href}
    assert parser.found_links == make_hyperlink_set([href])
Example no. 10
def test_get_hrefs_from_html_unique(input_links, output_results):
    html = make_html(make_a_tags(input_links))
    hrefs = {make_hyperlink(link) for link in output_results}
    assert get_hrefs_from_html(html).collection == hrefs
    assert get_hrefs_from_html(html) == make_hyperlink_set(hrefs)
Example no. 11
def test_hyperlink_join_with_relative_links(input_link, output_result):
    href = make_hyperlink(input_link)
    domain = "https://helloworld.com"
    assert str(href.join(domain)) == domain + output_result
Example no. 12
def test_hyperlink_is_absolute_or_relative(input_link, is_absolute_link):
    href = make_hyperlink(input_link)
    assert href.is_absolute == is_absolute_link
    assert href.is_relative != is_absolute_link
Example no. 13
def test_hyperlink(input_link, output_result):
    href = make_hyperlink(input_link)
    assert str(href) == output_result
Example no. 14
def test_hyperlink_set_relative_links_join_all(input_links, output_links):
    links = make_hyperlink_set(input_links)
    domain = "https://www.google.com"
    assert links.join_all(domain) == make_hyperlink_set(
        [make_hyperlink(domain + link) for link in output_links]
    )
Example no. 15
def test_hyperlink_normalisation(input_link, output_result):
    assert make_hyperlink(input_link).url == output_result