def _crawl_with_get_request(self, url):
    # Assumes `requests` is imported at module level and `html_helper` is a
    # project-local helper module (both referenced elsewhere in this class).
    self.logger.crawl_with_get_request(url)
    node = self.graph.nodes[url]
    node.request_type = "get"
    try:
        # timeout after 10 seconds
        res = requests.get(url, timeout=10)
    except Exception as e:
        node.status = "failure"
        node.error = e
    else:
        # Only parse the body for links when the response is a text type;
        # .get() avoids a KeyError if the server omits Content-Type.
        if res.text and res.headers.get("Content-Type", "").split("/")[0] == "text":
            (neighbors, errors) = html_helper.get_neighbors(res.text, res.url)
            self.errors.extend(errors)
            for neighbor_url in neighbors:
                self.graph.add_neighbor(url, neighbor_url)
                self._enqueue(neighbor_url)
        # Collect response metadata; keep redirect headers for 301 responses
        data = {"request_type": "get", "status": res.status_code}
        if res.status_code == 301:
            data["headers"] = res.headers
        node.status = "success"
        node.status_code = res.status_code
        node.contents = res.text
    finally:
        # Always finish bookkeeping for this URL, whether the request
        # succeeded or failed.
        self._finalize_crawl(url)