Example #1
    def _crawl_with_get_request(self, url):
        self.logger.crawl_with_get_request(url)

        node = self.graph.nodes[url]
        node.request_type = "get"

        try:
            # timeout after 10 seconds
            res = requests.get(url, timeout=10)
        except Exception as e:
            node.status = "failure"
            node.error = e
        else:
            # guard against responses that omit the Content-Type header
            if res.text and res.headers.get("Content-Type", "").split("/")[0] == "text":
                neighbors, errors = html_helper.get_neighbors(res.text, res.url)

                self.errors.extend(errors)

                for neighbor_url in neighbors:
                    self.graph.add_neighbor(url, neighbor_url)
                    self._enqueue(neighbor_url)

            data = {"request_type": "get", "status": res.status_code}

            if res.status_code == 301:
                data["headers"] = res.headers

            node.status = "success"
            node.status_code = res.status_code
            node.contents = res.text

        self._finalize_crawl(url)
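
Neither example shows `html_helper.get_neighbors`; only its call signature is visible (it takes the response body and the response URL and returns a `(neighbors, errors)` pair). A minimal standard-library sketch consistent with that signature might look like the following; the link-extraction details and error handling here are assumptions, not the original implementation:

from html.parser import HTMLParser
from urllib.parse import urljoin


class _LinkParser(HTMLParser):
    """Collects the href targets of <a> tags."""

    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.hrefs.append(value)


def get_neighbors(html, base_url):
    """Return (neighbors, errors): absolute link targets found in html,
    plus any errors encountered while parsing."""
    neighbors, errors = [], []
    parser = _LinkParser()
    try:
        parser.feed(html)
    except Exception as e:  # html.parser rarely raises, but stay defensive
        errors.append(e)
    for href in parser.hrefs:
        # resolve relative links against the page's own URL
        neighbors.append(urljoin(base_url, href))
    return neighbors, errors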
Example #2
    def _crawl_with_get_request(self, url):
        self.logger.crawl_with_get_request(url)

        node = self.graph.nodes[url]
        node.request_type = "get"

        try:
            # timeout after 2 seconds
            res = requests.get(url, timeout=2)
        except Exception as e:
            node.status = "failure"
            node.error = e
        else:
            # guard against responses that omit the Content-Type header
            if res.text and res.headers.get("Content-Type", "").split("/")[0] == "text":
                (neighbors, errors) = html_helper.get_neighbors(res.text, res.url)

                self.errors.extend(errors)

                for neighbor_url in neighbors:
                    self.graph.add_neighbor(url, neighbor_url)
                    self._enqueue(neighbor_url)

            data = {"request_type": "get", "status": res.status_code}

            if res.status_code == 301:
                data["headers"] = res.headers

            node.status = "success"
            node.status_code = res.status_code
            node.contents = res.text
        finally:
            self._finalize_crawl(url)
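
The two variants differ mainly in how `_finalize_crawl` is reached. In Example #1 it is called after the `try`/`except`/`else` statement, so an exception raised inside the `else` block (for instance by `html_helper.get_neighbors`) would propagate and skip it; Example #2's `finally` clause guarantees the node is finalized on every code path. Both snippets also assume a graph whose nodes are keyed by URL and carry the attributes written above. A minimal sketch of that structure follows; the class names and anything beyond the attributes and methods actually used in the snippets are assumptions:

class Node:
    """Per-URL crawl record; attribute names mirror those set in the snippets."""

    def __init__(self, url):
        self.url = url
        self.request_type = None   # e.g. "get"
        self.status = None         # "success" or "failure"
        self.status_code = None    # HTTP status code on success
        self.contents = None       # response body on success
        self.error = None          # exception instance on failure
        self.neighbors = set()     # outgoing links


class Graph:
    """URL-keyed node store matching the self.graph.nodes[url] usage above."""

    def __init__(self):
        self.nodes = {}

    def add_node(self, url):
        # idempotent: reuse the existing node if the URL was seen before
        return self.nodes.setdefault(url, Node(url))

    def add_neighbor(self, url, neighbor_url):
        self.add_node(neighbor_url)
        self.nodes[url].neighbors.add(neighbor_url)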