Example #1
	def get_requests_from_robots(self, request):
		purl = urlsplit(request.url)
		url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

		getreq = Request(REQTYPE_LINK, "GET", url)
		try:
			# HttpGet(request, timeout, retries=None, useragent=None, proxy=None)
			httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
			lines = httpget.get_file().split("\n")
		except urllib2.HTTPError:
			return []
		except:
			raise

		requests = []
		for line in lines:
			directive = ""
			url = None
			try:
				directive, url = re.sub(r"#.*", "", line).split(":", 1)
			except ValueError:
				continue # ignore lines that are not "<directive>: <value>"

			if re.match("(dis)?allow", directive.strip(), re.I):
				req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
				requests.append(req)

		return adjust_requests(requests) if requests else []
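
Each variant of this method parses robots.txt lines the same way: strip a trailing `#` comment, split on the first colon, and keep only `Allow`/`Disallow` directives. A minimal standalone sketch of just that step (plain Python, without the project's `Request`/`HttpGet` helpers; the function name is illustrative):

    import re

    def parse_robots_line(line):
        """Return (directive, value) for an Allow/Disallow line, else None."""
        # Drop everything after a '#' comment, then split on the first ':'.
        try:
            directive, value = re.sub(r"#.*", "", line).split(":", 1)
        except ValueError:
            return None  # blank line, comment-only line or malformed entry
        if re.match(r"(dis)?allow", directive.strip(), re.I):
            return directive.strip().lower(), value.strip()
        return None

    # parse_robots_line("Disallow: /admin/  # private")  -> ('disallow', '/admin/')
    # parse_robots_line("User-agent: *")                 -> None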
Example #2
    def get_requests_from_robots(self, request):
        purl = urlsplit(request.url)
        url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

        getreq = Request(REQTYPE_LINK,
                         "GET",
                         url,
                         extra_headers=Shared.options['extra_headers'])
        try:
            # HttpGet(request, timeout, retries=None, useragent=None, proxy=None)
            httpget = HttpGet(getreq, 10, 1, "Googlebot",
                              Shared.options['proxy'])
            lines = httpget.get_file().split("\n")
        except urllib.error.HTTPError:
            return []
        except Exception:
            # any other failure is treated like a missing robots.txt
            return []

        requests = []
        for line in lines:
            directive = ""
            url = None
            try:
                directive, url = re.sub(r"#.*", "", line).split(":", 1)
            except ValueError:
                continue  # ignore lines that are not "<directive>: <value>"

            if re.match("(dis)?allow", directive.strip(), re.I):
                req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
                requests.append(req)

        return adjust_requests(requests) if requests else []
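
The only functional differences from Example #1 are the Python 3 exception class (`urllib.error.HTTPError` instead of `urllib2.HTTPError`), the `extra_headers` passed to `Request`, and the fact that unexpected errors are swallowed rather than re-raised. If the same method had to run on both interpreters, a guarded import would keep the `except` clause identical (a sketch, not something the examples themselves do):

    try:
        from urllib.error import HTTPError   # Python 3
    except ImportError:
        from urllib2 import HTTPError        # Python 2

    # ...then a single "except HTTPError:" clause works under both versions.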
Example #3
    def _get_requests_from_robots(start_request):
        """
        read robots.txt file (if any) and create a list of request based on it's content

        :return: list of request
        """
        purl = urlsplit(start_request.url)
        url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

        getreq = Request(REQTYPE_LINK, "GET", url)
        try:
            # HttpGet(request, timeout, retries=None, user_agent=None, proxy=None)
            httpget = HttpGet(getreq, 10, 1, "Googlebot",
                              Shared.options['proxy'])
            lines = httpget.get_file().split("\n")
        except urllib2.HTTPError:
            return []
        except:
            raise

        requests = []
        for line in lines:
            directive = ""
            url = None
            try:
                directive, url = re.sub(r"#.*", "", line).split(":", 1)
            except ValueError:
                continue  # not a "<directive>: <value>" line (blank or comment-only)

            if re.match("(dis)?allow", directive.strip(), re.I):
                req = Request(REQTYPE_LINK,
                              "GET",
                              url.strip(),
                              parent=start_request)
                if request_is_crawlable(req):
                    requests.append(req)

        return adjust_requests(requests) if requests else []
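
Because `Request`, `HttpGet`, `Shared`, `adjust_requests` and `request_is_crawlable` are helpers of the surrounding crawler project, the examples above are not runnable on their own. A rough standard-library approximation of the same flow, with an illustrative name and behaviour (it resolves the extracted paths against the host, which the originals delegate to the `parent` request), might look like this:

    import re
    import urllib.error
    import urllib.request
    from urllib.parse import urljoin, urlsplit

    def get_urls_from_robots(start_url, timeout=10, user_agent="Googlebot"):
        """Fetch <scheme>://<host>/robots.txt and return its Allow/Disallow targets as absolute URLs."""
        purl = urlsplit(start_url)
        robots_url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

        req = urllib.request.Request(robots_url, headers={"User-Agent": user_agent})
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                lines = resp.read().decode("utf-8", errors="replace").split("\n")
        except urllib.error.URLError:
            return []  # no robots.txt, or host unreachable: nothing to add

        urls = []
        for line in lines:
            try:
                directive, path = re.sub(r"#.*", "", line).split(":", 1)
            except ValueError:
                continue  # blank, comment-only or malformed line
            if re.match(r"(dis)?allow", directive.strip(), re.I):
                urls.append(urljoin(robots_url, path.strip()))
        return urls

    # get_urls_from_robots("https://example.com/") -> e.g. ["https://example.com/admin/", ...]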