def open(self, url, data=None):
    # cf https://github.com/w3c/py-http-handler/blob/master/checkremote.py
    from checkremote import check_url_safety, UnsupportedResourceError
    try:
        check_url_safety(url)
    except UnsupportedResourceError:
        raise IOError(403, "Access to url '%s' is not allowed" % url)
    if self.surblchecker.isMarkedAsSpam(url):
        raise IOError(
            403,
            "Access to url '%s' is not allowed as it is marked as spam in SURBL"
            % url)
    return urllib.FancyURLopener.open(self, url, data)
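For context, a minimal sketch of how an opener like the one above might be assembled and used; the `SafeURLopener` name and the `surblchecker` constructor argument are hypothetical stand-ins, only `checkremote.check_url_safety` and the `surblchecker.isMarkedAsSpam()` call come from the snippet itself.

# Hypothetical wiring for the method above (Python 2, where
# urllib.FancyURLopener lives in the top-level urllib module).
import urllib


class SafeURLopener(urllib.FancyURLopener):
    def __init__(self, surblchecker, *args, **kwargs):
        urllib.FancyURLopener.__init__(self, *args, **kwargs)
        # Any object exposing isMarkedAsSpam(url); assumed, not part of the snippet.
        self.surblchecker = surblchecker

    # ... the open() method shown above would be defined here ...


# Disallowed or spam-listed URLs then surface as IOError(403, ...):
#   opener = SafeURLopener(my_surbl_checker)
#   page = opener.open('http://example.org/').read()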
Example #2
def brett_test(uri):

    if not sys.platform == "darwin":
        from checkremote import check_url_safety, UnsupportedResourceError
        from urllib2 import HTTPError, URLError
        try:
            check_url_safety(uri)
        except HTTPError as e:
            err_message(
                'HTTP Error with the error code: %s and the error message: "%s"' % (
                    e.code, e.reason))
        except URLError as e:
            err_message('URL Error with the error message: "%s"' % e.reason)
        except UnsupportedResourceError as e:
            msg = e.args[0] + ": " + e.args[1]
            err_message(
                'Unsupported Resource Error with the error message "%s"' % msg)
        except Exception as e:
            l = len(e.args)
            msg = "" if l == 0 else (e.args[0] if l == 1 else e.args)
            err_message('Exception raised: "%s"' % msg)
Example #3
def brett_test(uri):
	"""
	Test, when running on W3C, the safety of the URL.

	:param str uri: The URI used to start up the script
	:return: result of the check
	:rtype: Boolean

	If the test does not pass, i.e., an exception is raised somewhere down the line, an error message is sent back (via HTTP) to the caller.

	Contributed by Brett Smith, W3C, and relying on an external library (``check_url_safety``) running at the W3C. *This method runs only on the W3C site and its invocation must be preceded by an appropriate check*.
	"""
	from checkremote import check_url_safety, UnsupportedResourceError
	if PY3:
		from urllib.error import HTTPError, URLError
	else:
		from urllib2 import HTTPError, URLError
	try:
		check_url_safety(uri)
		# If we got here, there have been no issues; Brett's script simply raises exceptions
		return True
	except HTTPError as e:
		err_message(uri, 'HTTP Error with the error code: %s and the error message: "%s"' % (e.code, e.reason))
	except URLError as e:
		err_message(uri, 'URL Error with the error message: "%s"' % e.reason)
	except UnsupportedResourceError as e:
		msg = e.args[0] + ": " + e.args[1]
		err_message(uri, 'Unsupported Resource Error with the error message "%s"' % msg)
	except Exception as e:
		args = len(e.args)
		msg = "" if args == 0 else (e.args[0] if args == 1 else repr(e.args))
		err_message(uri, 'Exception raised: "%s"' % msg)

	# If we got here, one of the exceptions was handled, i.e., the result of the check
	# is False...
	return False
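Since the docstring notes that the invocation must be preceded by an appropriate check, a guard could look like the following sketch; it assumes `brett_test` and `err_message` are defined as above and reuses the `socket.getfqdn().endswith('.w3.org')` test that appears in the later examples.

import socket

def check_uri_if_on_w3c(uri):
    # Run Brett's safety test only when executing on a W3C host, as the
    # docstring requires; elsewhere the check is simply skipped.
    if socket.getfqdn().endswith('.w3.org'):
        return brett_test(uri)
    return True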
Example #4
    def __init__(self, name, additional_headers={}):
        """
        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            url = name.split('#')[0]
            if socket.getfqdn().endswith('.w3.org'):
                import checkremote
                checkremote.check_url_safety(url)
            if 'Accept' not in additional_headers:
                additional_headers['Accept'] = 'text/html, application/xhtml+xml'

            import requests
            # Switching off the verification is not cool. But, at least for now, too many
            # sites still go wrong because their certificates are not o.k. with requests...
            r = requests.get(url, headers=additional_headers, verify=False)
            self.data = r.content
            self.headers = r.headers

            if URIOpener.CONTENT_TYPE in self.headers:
                # The call below will remove the possible media type parameters, like charset settings
                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
                self.content_type = ct.media_type
                if 'charset' in ct.parmdict:
                    self.charset = ct.parmdict['charset']
                else:
                    self.charset = None
                # print
            else:
                # check if the suffix can be used for the content type; this may be important
                # for file:// type URI or if the server is not properly set up to return the right
                # mime type
                self.charset = None
                self.content_type = ""
                for suffix in preferred_suffixes.keys():
                    if name.endswith(suffix):
                        self.content_type = preferred_suffixes[suffix]
                        break

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(
                    r.url, self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
            if URIOpener.EXPIRES in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.expiration_date = parse_http_datetime(
                        self.headers[URIOpener.EXPIRES])
                except:
                    # The Expires date format was wrong, sorry, forget it...
                    pass

            self.last_modified_date = None
            if URIOpener.LAST_MODIFIED in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.last_modified_date = parse_http_datetime(
                        self.headers[URIOpener.LAST_MODIFIED])
                except:
                    # The last modified date format was wrong, sorry, forget it...
                    pass

        except urllib_HTTPError:
            e = sys.exc_info()[1]
            from . import HTTPError
            msg = BaseHTTPRequestHandler.responses[e.code]
            raise HTTPError('%s' % msg[1], e.code)
        except Exception:
            e = sys.exc_info()[1]
            from . import RDFaError
            raise RDFaError('%s' % e)
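A minimal usage sketch of the class above, assuming it is named URIOpener and that the attributes follow the assignments in __init__; the URL is illustrative only, and its fragment would be stripped before the request, as noted in the code. On failure the constructor raises the package's HTTPError or RDFaError.

opener = URIOpener('http://www.w3.org/TR/rdfa-core/#abstract',
                   additional_headers={'Accept': 'text/html'})
print(opener.content_type)     # media type without parameters, e.g. 'text/html'
print(opener.charset)          # charset parameter of the Content-Type header, or None
print(opener.location)         # Content-Location if present, else the original URL
print(opener.expiration_date)  # parsed Expires header, or utcnow() + one day
print(len(opener.data))        # the raw response body fetched via requests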
Example #5
	def __init__(self, name, additional_headers = {}) :
		"""
		@param name: URL to be opened
		@keyword additional_headers: additional HTTP request headers to be added to the call
		"""		
		try :
			# Note the removal of the fragment ID. This is necessary, per the HTTP spec
			url = name.split('#')[0]
			if socket.getfqdn().endswith('.w3.org'):
				import checkremote
				checkremote.check_url_safety(url)
			if 'Accept' not in additional_headers:
				additional_headers['Accept'] = 'text/html, application/xhtml+xml'
				
			import requests
			r = requests.get(url, headers=additional_headers)
			self.data	= r.content
			self.headers	= r.headers
			
			if URIOpener.CONTENT_TYPE in self.headers :
				# The call below will remove the possible media type parameters, like charset settings
				ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
				self.content_type = ct.media_type
				if 'charset' in ct.parmdict :
					self.charset = ct.parmdict['charset']
				else :
					self.charset = None
				# print
			else :
				# check if the suffix can be used for the content type; this may be important
				# for file:// type URI or if the server is not properly set up to return the right
				# mime type
				self.charset = None
				self.content_type = ""
				for suffix in preferred_suffixes.keys() :
					if name.endswith(suffix) :
						self.content_type = preferred_suffixes[suffix]
						break
			
			if URIOpener.CONTENT_LOCATION in self.headers :
				self.location = urljoin(r.url,self.headers[URIOpener.CONTENT_LOCATION])
			else :
				self.location = name
			
			self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
			if URIOpener.EXPIRES in self.headers :
				try :
					# Thanks to Deron Meranda for the HTTP date conversion method...
					self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
				except :
					# The Expires date format was wrong, sorry, forget it...
					pass

			self.last_modified_date = None
			if URIOpener.LAST_MODIFIED in self.headers :
				try :
					# Thanks to Deron Meranda for the HTTP date conversion method...
					self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
				except :
					# The last modified date format was wrong, sorry, forget it...
					pass
				
		except urllib_HTTPError :
			e = sys.exc_info()[1]
			from . import HTTPError
			msg = BaseHTTPRequestHandler.responses[e.code]
			raise HTTPError('%s' % msg[1], e.code)
		except Exception :
			e = sys.exc_info()[1]
			from . import RDFaError
			raise RDFaError('%s' % e)