def get_requests_from_robots(self, request):
    """
    Fetch and parse robots.txt for the host of *request* and build one GET
    request per Allow/Disallow directive found in it.

    :param request: Request whose scheme/netloc locates the robots.txt file
    :return: list of Request objects (run through adjust_requests), or []
             when robots.txt is missing/unreachable or contains no directives
    """
    purl = urlsplit(request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

    getreq = Request(REQTYPE_LINK, "GET", url)
    try:
        # HttpGet(request, timeout, retries=None, useragent=None, proxy=None)
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib2.HTTPError:
        # no robots.txt (404) or server refused it: nothing to extract
        return []
    # any other exception propagates to the caller unchanged

    requests = []
    for line in lines:
        try:
            # strip trailing comment, then split "directive: path"
            directive, url = re.sub(r"#.*", "", line).split(":", 1)
        except ValueError:
            # line has no ":" separator (blank line, comment-only, junk)
            continue
        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
            requests.append(req)

    return adjust_requests(requests) if requests else []
def get_requests_from_robots(self, request):
    """
    Fetch and parse robots.txt for the host of *request* and build one GET
    request per Allow/Disallow directive found in it.

    :param request: Request whose scheme/netloc locates the robots.txt file
    :return: list of Request objects (run through adjust_requests), or []
             when robots.txt cannot be fetched or contains no directives
    """
    purl = urlsplit(request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

    getreq = Request(REQTYPE_LINK, "GET", url,
                     extra_headers=Shared.options['extra_headers'])
    try:
        # HttpGet(request, timeout, retries=None, useragent=None, proxy=None)
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib.error.HTTPError:
        # no robots.txt (404) or server refused it: nothing to extract
        return []
    except Exception:
        # best-effort: any other fetch failure is treated as "no robots.txt"
        # (narrowed from bare except so KeyboardInterrupt/SystemExit escape)
        return []

    requests = []
    for line in lines:
        try:
            # strip trailing comment, then split "directive: path"
            directive, url = re.sub(r"#.*", "", line).split(":", 1)
        except ValueError:
            # line has no ":" separator (blank line, comment-only, junk)
            continue
        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
            requests.append(req)

    return adjust_requests(requests) if requests else []
def _get_requests_from_robots(start_request):
    """
    Read robots.txt file (if any) and create a list of requests based on its
    content.

    :param start_request: Request whose scheme/netloc locates robots.txt
    :return: list of crawlable Request objects (run through adjust_requests),
             or [] when robots.txt is missing or yields no crawlable entries
    """
    purl = urlsplit(start_request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

    getreq = Request(REQTYPE_LINK, "GET", url)
    try:
        # HttpGet(request, timeout, retries=None, user_agent=None, proxy=None)
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib2.HTTPError:
        # no robots.txt (404) or server refused it: nothing to extract
        return []
    # any other exception propagates to the caller unchanged

    requests = []
    for line in lines:
        try:
            # strip trailing comment, then split "directive: path"
            directive, url = re.sub(r"#.*", "", line).split(":", 1)
        except Exception as e:
            print(str(e))
            continue  # ignore malformed lines
        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=start_request)
            # only keep entries the crawler's scope rules allow
            if request_is_crawlable(req):
                requests.append(req)

    return adjust_requests(requests) if requests else []