Example #1
0
    def execute(self, obj):
        """Run ``obj`` through the parse/process pipeline and re-parse it.

        ``obj`` is handed to ``self._parse``; the mapping it returns is
        expanded as keyword arguments into ``self._process``.  The processed
        components are composed back into a string with ``rfc3987.compose``
        and the result of parsing that string again is returned.

        Raises:
            TypeError: if ``obj`` is not a ``str``.
        """
        if isinstance(obj, str):
            components = self._process(**self._parse(obj))
            recomposed = rfc3987.compose(**components)
            return rfc3987.parse(recomposed)

        raise TypeError("'{}' is not of type str.".format(obj))
Example #2
0
    def execute(self, obj):
        """Parse, process and re-compose a string, returning the parsed result.

        The string is first split by ``self._parse``; those parts are passed
        as keyword arguments to ``self._process``.  Whatever comes back is
        joined with ``rfc3987.compose`` and parsed once more with
        ``rfc3987.parse``, whose result is returned.

        Raises:
            TypeError: if ``obj`` is not a ``str``.
        """
        if isinstance(obj, str):
            processed = self._process(**self._parse(obj))
            return rfc3987.parse(rfc3987.compose(**processed))

        raise TypeError("'{}' is not of type str.".format(obj))
Example #3
0
def is_internal_url(url):
    """Tell whether a parsed URL points at software.esciencecenter.nl.

    ``url`` is a mapping with at least the keys ``authority``, ``path`` and
    ``scheme``.  A URL counts as internal when its authority is ``None`` or
    ``'software.esciencecenter.nl'``.

    Returns:
        ``True`` for an internal URL, ``False`` otherwise.

    Raises:
        ValueError: if the URL is internal but its path does not start with
            ``/``, or if it is internal and uses the ``https`` scheme.
    """
    if url['authority'] not in (None, 'software.esciencecenter.nl'):
        # External URLs are not validated any further.
        return False

    if not url['path'].startswith('/'):
        composed = rfc3987.compose(**url)
        raise ValueError('Path {} must start with /'.format(composed))

    if url['scheme'] == 'https':
        raise ValueError('For the time being, use http instead of https '
                         'prefixes for http://software.esciencecenter.nl')

    return True
Example #4
0
def parse_url(url):
    """Validate an http(s) URI and return it without its fragment.

    The input is parsed with the ``URI`` rule of ``rfc3987``.  An empty path
    is replaced by ``/`` and the fragment is dropped before the components
    are composed back into a string.

    Raises:
        HTTPBadRequest: if ``url`` is not a valid URI or its scheme is
            neither ``http`` nor ``https``.
    """
    try:
        parts = rfc3987.parse(url, rule='URI')
    except ValueError:
        raise HTTPBadRequest(detail=Messages.invalid_uri)

    if parts['scheme'] not in ('http', 'https'):
        raise HTTPBadRequest(detail=Messages.invalid_uri)

    if not parts['path']:
        parts['path'] = '/'
    parts['fragment'] = None

    return rfc3987.compose(**parts)
Example #5
0
 def _sanitizewebiri(iri):
     """Turn a Web IRI into a percent-encoded URI string.

     The IRI is parsed with the ``IRI`` rule of ``rfc3987``; its authority is
     replaced by the output of ``DataProvider._sanitizefqdn`` minus the last
     character, the parts are composed again and the UTF-8 encoded result is
     percent-quoted.

     Raises:
         ValueError: if the scheme is missing or is not http/https, or if
             the IRI has no authority (``rfc3987.parse`` may also raise it
             for input that is not a valid IRI).
     """
     res = rfc3987.parse(iri, b"IRI")
     scheme = res[b"scheme"]
     if scheme is None or scheme.lower() not in ("http", "https"):
         raise ValueError("Not a Web address")
     authority = res[b"authority"]
     if not authority:
         raise ValueError("No host specified")
     # NOTE(review): the [:-1] presumably strips a trailing dot added by
     # _sanitizefqdn -- confirm against that helper.
     res[b"authority"] = DataProvider._sanitizefqdn(authority)[:-1]
     iri = rfc3987.compose(**res)
     # Derived from Django
     uri = urllib.quote(iri.encode("utf-8"), safe=b"/#%[]=:;$&()+,!?*@'~")
     return unicode(uri)
    def remap(url):
        """Rewrite ``url`` from the source location to the destination.

        Relies on ``src_parts``, ``src_path``, ``dest_parts`` and
        ``dest_path`` from the enclosing scope (not shown here).

        Returns:
            A ``(remapped, url)`` tuple.  ``remapped`` is ``False`` and the
            URL is returned untouched when its scheme/authority differ from
            the source or its resolved path is not the source path or below
            it; otherwise ``remapped`` is ``True`` and the URL is rebuilt on
            the destination scheme, authority and path.
        """
        parts = parse(url, 'URI')
        if (parts['scheme'] != src_parts['scheme']
                or parts['authority'] != src_parts['authority']):
            return False, url

        resolved = Path(unquote(parts['path'])).resolve()
        inside_source = src_path == resolved or src_path in resolved.parents
        if not inside_source:
            return False, url

        relocated = str(dest_path / resolved.relative_to(src_path))

        # Keep a trailing slash when the incoming path had one, so later
        # URI resolution against the result behaves the same way.
        if parts['path'].endswith('/'):
            relocated += '/'

        remapped = compose(
            scheme=dest_parts['scheme'],
            authority=dest_parts['authority'],
            path=quote(relocated),
            query=parts['query'],
            fragment=parts['fragment'],
        )
        return True, remapped
Example #7
0
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent encoding)

    A valid IRI is returned unchanged.  Otherwise invalid characters in the
    path and fragment are replaced by underscores and the query is percent
    quoted; if the result still does not parse, the whole input is percent
    quoted instead.

    Raises:
        Exception: if the input is not a valid IRI and has no scheme or no
            net location part.
    """
    # First decode the IRI if needed
    if not isinstance(iri, str):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the URI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # rfc3987.parse signals a non-matching string with ValueError; a bare
        # except here would also swallow KeyboardInterrupt and SystemExit.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # The URI is not valid, so we'll have to fix it.
    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise Exception(message)

    logger.debug(
        "The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}
    # We'll now convert the path, query and fragment parts of the URI

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to match
    # the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(
            u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'),
                                             safe="&=")
    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
Example #8
0
def absolute_url(url):
    """Return ``url`` as a string, filling in a default host when missing.

    The URL is parsed with ``parse_url``.  When the parsed result has no
    authority, the scheme is set to ``http`` and the authority to
    ``software.esciencecenter.nl`` before the parts are composed again.
    """
    parts = parse_url(url)
    has_authority = parts['authority'] is not None
    if not has_authority:
        parts['scheme'] = 'http'
        parts['authority'] = 'software.esciencecenter.nl'
    composed = rfc3987.compose(**parts)
    return composed
Example #9
0
def check_internal_url(url):
    """Ensure that the parsed ``url`` is internal.

    Returns ``None`` when ``is_internal_url`` accepts the URL.

    Raises:
        ValueError: if ``is_internal_url`` reports the URL as not internal
            (or raises itself while validating it).
    """
    if is_internal_url(url):
        return

    composed = rfc3987.compose(**url)
    raise ValueError('Url {} is not internal'.format(composed))
Example #10
0
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent encoding)

    A valid IRI is returned unchanged.  Otherwise invalid characters in the
    path and fragment are replaced by underscores and the query is percent
    quoted; if the result still does not parse, the whole input is percent
    quoted instead.

    Raises:
        Exception: if the input is not a valid IRI and has no scheme or no
            net location part.
    """
    # First decode the IRI if needed
    if not isinstance(iri, unicode):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the URI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # rfc3987.parse signals a non-matching string with ValueError; a bare
        # except here would also swallow KeyboardInterrupt and SystemExit.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # The URI is not valid, so we'll have to fix it.
    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise Exception(message)

    logger.debug("The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}
    # We'll now convert the path, query and fragment parts of the URI

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to match
    # the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(
            u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'),
                                             safe="&=")
    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
Example #11
0
File: owl.py Project: Jeket/eddy
 def __str__(self):
     """Return the string obtained by composing ``self.components``."""
     parts = self.components
     return compose(**parts)