Example #1
def get_uris(strings, min_length=10):
    """Return the entries of ``strings`` that are complete URIs.

    An entry is kept when it is strictly longer than ``min_length``
    characters and matches the anchored ``URI`` rule compiled by
    ``rfc3987.get_compiled_pattern``.

    :param strings: collection of candidate strings
    :param min_length: entries must be longer than this many characters
    :returns: list of the matching entries, in input order
    """
    # used when calling .extract_fileurl() before .extract_strings()
    # NOTE(review): the body of this guard was missing in the source;
    # returning an empty list keeps the return type uniform - confirm.
    if not strings:
        return []

    _uri_matcher = rfc3987.get_compiled_pattern("^%(URI)s$")

    return [
        candidate for candidate in strings
        if len(candidate) > min_length and _uri_matcher.match(candidate)
    ]
Example #2
 def build(self, *which):
     """Compile a single pattern matching any of the requested components.

     For every name in ``which`` the attribute ``<name>_regex`` is read from
     ``self``. The fragments are joined as alternatives inside one named
     group called ``string``.

     :param which: a (sub)set of ["uri", "email"]
     :returns: the pattern compiled by ``rfc3987.get_compiled_pattern`` with
         the IGNORECASE, VERBOSE and UNICODE flags
     :raises AttributeError: if ``self`` lacks a ``<name>_regex`` attribute
     """
     regex_components = [
         getattr(self, regex_type + '_regex') for regex_type in which
     ]
     string = u'(?P<string>\n%s\n)' % u'\n|\n'.join(regex_components)
     return rfc3987.get_compiled_pattern(string, re.IGNORECASE | re.VERBOSE | re.UNICODE)
Example #3
 def build(self, *which):
     """Build one compiled regex out of the selected component regexes.

     Each entry of ``which`` names an attribute ``<entry>_regex`` on
     ``self``; those fragments become the alternatives of a named group
     ``string``.

     :param which: a (sub)set of ["uri", "email"]
     :returns: the result of ``rfc3987.get_compiled_pattern`` called with
         the combined template and IGNORECASE | VERBOSE | UNICODE
     :raises AttributeError: if a requested ``<entry>_regex`` is missing
     """
     regex_components = [
         getattr(self, regex_type + '_regex') for regex_type in which
     ]
     string = u'(?P<string>\n%s\n)' % u'\n|\n'.join(regex_components)
     return rfc3987.get_compiled_pattern(
         string, re.IGNORECASE | re.VERBOSE | re.UNICODE)
Example #4
def extract_text_from_html(input_, pre_strip_urls=False):
    """
    Take a HTML document and return the raw text

    The raw text is returned without any structure (including without
    punctuation) and without non alphabetic characters.

    bs4.BeautifulSoup.get_text is used for parsing. It removes HTML comments,
    scripts and styles prior to extracting the text. Only text in the body is
    returned.

    Parameters
    ----------
    input_ : str or bs4.BeautifulSoup
        HTML document from which the text will be extracted

    pre_strip_urls : bool
        If True, Links that are found in the text part of the HTML are removed
        prior to filtering

    Returns
    -------
    text : str
        The text found in the HTML document, passed through ``filter_text``.
        None is returned when the body text cannot be read (for example
        when the document has no body).

    Raises
    ------
    ValueError
        If ``input_`` is neither a string nor a BeautifulSoup object, or if
        ``pre_strip_urls`` is requested but rfc3987 is unavailable.
    """
    if isinstance(input_, bs4.BeautifulSoup):
        bs = input_
    elif isinstance(input_, basestring):
        bs = parse_html(input_)
    else:
        raise ValueError("unknown input_ - expected str or "
                         "bs4.BeautifulSoup got '%s'" % (type(input_)))

    try:
        text = bs.body.get_text().strip()
    except (TypeError, AttributeError):
        # e.g. ``bs.body`` is None, so there is nothing to extract
        return None

    if pre_strip_urls:
        # NOTE(review): the original availability check was lost from the
        # source; this guard covers both a missing import (NameError) and a
        # None placeholder (AttributeError) - confirm against the module's
        # import section.
        try:
            uri_pattern = rfc3987.get_compiled_pattern(rule='URI')
        except (NameError, AttributeError):
            raise ValueError("pre_strip_urls requires regex and rfc3987 "
                             "libraries. One or both could not be found")
        text = uri_pattern.sub(" ", text)

    return filter_text(text)
Example #5
import rfc3987

# Unanchored pattern built from the rfc3987 ``URI`` rule, compiled once at
# import time so it can be reused for every search.
URL_REGEX = rfc3987.get_compiled_pattern('%(URI)s')

def extract_urls(text):
    '''Extract urls from given text

    :text: (str) Text to extract urls from
    :returns: List of urls extracted
    '''
    # NOTE(review): ``findall`` returns group tuples instead of whole
    # matches when the pattern defines groups; the rfc3987 URI pattern
    # presumably defines named groups - confirm callers expect that shape.
    return URL_REGEX.findall(text)
    return (instance.__class__,) + args

# Module-level caches; each one starts out as its own empty dict.
# NOTE(review): what each cache holds is not visible here - see the callers.
cache_container_agnostic_configuration = dict()
cache_container_ssl_configuration = dict()
cache_container_url2entity_configuration = dict()
cache_container_nginx_fs = dict()
cache_extra_from_distrib = dict()

# URL processing

# Anchored rfc3987 ``URI`` pattern: a match means the whole string is a URI.
URI_rfc3987                        =       rfc3987.get_compiled_pattern('^%(URI)s$')

def common_process_uri(
    key_uri, uri                    = [ ( k, v ) for k, v in d.items() if k.startswith( suffixwith ) and k.endswith( 'URI' ) ][ 0 ]

    d_rfc3987 = URI_rfc3987.match( uri ).groupdict()
        except Exception as e:
    if not dataset_ref:
        dataset_ref = g.value(predicate=RDF.type, object=DCAT.Dataset)
    return dataset_ref

# TODO disallows whitespaces
# Case-insensitive pattern for a complete http(s)/ftp(s) URL.
VALID_URL = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # dotted domain name...
    r'localhost|'  # ...or the literal "localhost"...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or a dotted-quad IPv4 address
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',  # optional path/query without whitespace
    flags=re.IGNORECASE
)

# Anchored ``URI`` rule pattern (``^...$``): a match means the entire string
# is a URI. NOTE(review): ``get_compiled_pattern`` is presumably imported
# from rfc3987 - the import is not visible here.
URI = get_compiled_pattern('^%(URI)s$')

def is_valid_url(references):
    """Tell whether ``references`` parses as a URL with a scheme and a host.

    :param references: the candidate URL string
    :returns: True when ``urlparse`` finds both a scheme and a network
        location; False otherwise, including when parsing raises
    """
    try:
        res = urlparse.urlparse(references)
    except Exception:
        # Deliberate best effort: any parsing failure means "not a URL".
        return False
    return bool(res.scheme and res.netloc)

def is_valid_uri(references):
    """Return True when ``references`` matches the module-level ``URI`` pattern."""
    uri_match = URI.match(references)
    return uri_match is not None

def graph_from_opendatasoft(g, dataset_dict, portal_url):
Example #8
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent encoding)

    NOTE(review): several structural lines of this function (the try/except
    headers, the pattern template and the ``safe`` argument) were missing
    from the source and have been reconstructed - confirm against the
    upstream implementation.

    :param iri: the IRI to check and, if needed, quote
    :returns: ``iri`` unchanged when it already parses as an IRI, otherwise a
        quoted version of it
    :raises Exception: when ``iri`` is invalid and has no scheme or no net
        location part, so it cannot be repaired
    """
    # First decode the IRI if needed
    if not isinstance(iri, str):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the URI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # The URI is not valid, so we'll have to fix it.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        raise Exception(message)

    logger.debug(
        "The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}
    # We'll now convert the path, query and fragment parts of the URI

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to match
    # the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(
            u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'),
                                             safe="&=")
    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri, rule='IRI')
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
Example #9
import re
import sys
from rfc3986 import is_valid_uri
from rfc3987 import get_compiled_pattern
from strict_rfc3339 import validate_rfc3339

# Python 3 has no ``long`` or ``unicode`` builtins; bind both names to
# their Python 3 equivalents so later code can use them on either version.
if sys.version_info[0] >= 3:
    long, unicode = int, str

# Whole-string e-mail shape: local part, "@", domain containing at least one
# dot. Only the characters listed in the classes are accepted.
email_regex = re.compile(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)')
# Anchored IPv4 / IPv6 address patterns built from the rfc3987 rule templates.
ipv4_regex = get_compiled_pattern('^%(IPv4address)s$')
ipv6_regex = get_compiled_pattern('^%(IPv6address)s$')
# 1-63 letters, digits or hyphens (case-insensitive) running to the end of
# the string, neither starting nor ending with a hyphen. Written as a raw
# string so that ``\d`` is not treated as an (invalid) string escape.
hostname_regex = re.compile(r'(?!-)[A-Z\d-]{1,63}(?<!-)$', re.IGNORECASE)

# Expose strict_rfc3339's validator under the short name ``rfc3339``.
rfc3339 = validate_rfc3339

def email(value):
    """Return True when ``value`` matches ``email_regex``, else False."""
    return email_regex.match(value) is not None

def ipv4(value):
    """Return True when ``value`` matches ``ipv4_regex``, else False."""
    return ipv4_regex.match(value) is not None

def ipv6(value):
    """Return True when ``value`` matches ``ipv6_regex``, else False."""
    return ipv6_regex.match(value) is not None

def uri(value):
Example #10
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent encoding)

    NOTE(review): the try/except headers of this function were missing from
    the source and have been reconstructed - confirm against the upstream
    implementation.

    :param iri: the IRI to check and, if needed, quote
    :returns: ``iri`` unchanged when it already parses as an IRI, otherwise a
        quoted version of it
    :raises Exception: when ``iri`` is invalid and has no scheme or no net
        location part, so it cannot be repaired
    """
    # First decode the IRI if needed
    if not isinstance(iri, unicode):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the URI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # The URI is not valid, so we'll have to fix it.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        raise Exception(message)

    logger.debug("The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}
    # We'll now convert the path, query and fragment parts of the URI

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to match
    # the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'), safe="&=")
    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri, rule='IRI')
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
Example #11
import unittest
import urlparse
from collections import Counter, OrderedDict
from operator import attrgetter, itemgetter

import bs4
import praw
import rfc3987
import urlobject
from jinja2 import Markup, Environment, PackageLoader

# Application identity, also used to build the user agent string below.
__APPNAME__ = "redem"
__VERSION__ = "0.1.1"
# The backslash is written as an explicit ``\\`` escape (``\%`` is not a
# valid escape sequence); the resulting string is unchanged.
# NOTE(review): a "/" separator may have been intended here - confirm.
__USER_AGENT__ = '%s (praw)\\%s' % (__APPNAME__, __VERSION__)

# Compiled pattern for the rfc3987 rule named 'URI' (unanchored).
uri_rgx = rfc3987.get_compiled_pattern('URI')  # URI_reference
# Logger named "<appname>.cli".
log = logging.getLogger('%s.cli' % __APPNAME__)

Example #12
 def _is_valid(self, url):
     """Return True when ``url`` as a whole matches the rfc3987 ``URI`` rule."""
     # FIXME: disallow file:// scheme - security risk
     uri_match = rfc3987.get_compiled_pattern('^%(URI)s$').match(url)
     if uri_match is None:
         return False
     return True
Example #13
    return pp.Group(
        pp.Keyword(tag_name) + COLON + BOOLEAN + EOL

def id_only_tag(tag_name: str) -> pp.ParserElement:
    """Build the parser for a ``<tag_name>: <id>`` clause.

    The clause is the keyword ``tag_name``, ``COLON``, ``_id_value`` and
    ``EOL``, wrapped in a group.

    NOTE(review): the closing of ``pp.Group(`` was missing from the source;
    naming the result after ``tag_name`` mirrors ``basic_tag_value_pair`` -
    confirm against the upstream grammar.
    """
    return pp.Group(
        pp.Keyword(tag_name) + COLON + _id_value + EOL
    )(tag_name)

def basic_tag_value_pair(tag_name: str) -> pp.ParserElement:
    """Build the parser for a ``<tag_name>: <unquoted value>`` clause.

    The grouped clause is given ``tag_name`` as its results name.
    """
    keyword = pp.Keyword(tag_name)
    clause = pp.Group(keyword + COLON + obo_unquoted + EOL)
    return clause(tag_name)

# One of the three stanza keywords.
stanza_type = (pp.Keyword('Term') | pp.Keyword('Typedef') | pp.Keyword('Instance'))
# A stanza header: a stanza type between brackets at the start of a line.
stanza_name = (pp.LineStart() + OPEN_BRACKET + stanza_type + CLOSE_BRACKET)('stanza_name')
# A tag: one or more characters other than ':' and newline.
tag = pp.Regex(r'[^:\n]+')('tag')
# A value: printable words (excluding '{', '!' and '"') or quoted strings,
# collected until the end of the line and joined with single spaces.
value = pp.OneOrMore(
    pp.Word(pp.printables, excludeChars='{!"') | quoted_string, stopOn=pp.LineEnd()).setParseAction(
    ' '.join)('value')
# The literal 'xsd:' followed by letters, combined into a single token.
xsd_type = pp.Combine('xsd:' + pp.Word(pp.alphas))('xsd-type')
dbxref_name = (pp.Regex(r'(?:[^],\"\\\n]|\\.)+') +
dbxref = pp.Group(
    OPEN_BRACKET + (pp.delimitedList(dbxref_name, ',') | pp.Empty()) + CLOSE_BRACKET
# Generic "<tag>: <value>" line, grouped.
tag_value_pair = pp.Group(tag + COLON + value + EOL)
# One or more characters other than NUL and newline.
file_path = pp.Regex(r'[^\0\n]+')
# An IRI, matched with the compiled pattern for the IRI rule template.
# NOTE(review): get_compiled_pattern is presumably rfc3987's - the import is
# not visible here.
iri = pp.Regex(get_compiled_pattern('%(IRI)s'))