Beispiel #1
0
def get_uris(strings, min_length=10):
    """Return the entries of ``strings`` that are complete URIs.

    An entry is kept when it is strictly longer than ``min_length``
    characters and matches the rfc3987 ``URI`` rule from start to end.

    :param strings: sized collection of candidate strings
    :param min_length: length a candidate has to exceed to be considered
    :returns: list of matching entries in input order, or ``None`` when
        ``strings`` is empty
    """
    # Empty input happens when .extract_fileurl() is called before
    # .extract_strings(); in that case there is nothing to scan.
    if len(strings) < 1:
        return None

    uri_pattern = rfc3987.get_compiled_pattern("^%(URI)s$")

    found = []
    for candidate in strings:
        if len(candidate) > min_length and uri_pattern.match(candidate):
            found.append(candidate)
    return found
Beispiel #2
0
 def build(self, *which):
     """
     Compile a single alternation pattern from the selected component regexes.

     Each name ``x`` in ``which`` is looked up as the attribute ``x_regex``
     on ``self``; the alternatives are wrapped in the named group ``string``.

     :param which: a (sub)set of ["uri", "email"]
     :returns: the pattern compiled by ``rfc3987.get_compiled_pattern`` with
         the IGNORECASE, VERBOSE and UNICODE flags
     """
     alternatives = [getattr(self, name + '_regex') for name in which]
     body = u'\n|\n'.join(alternatives)
     template = u'(?P<string>\n%s\n)' % body
     return rfc3987.get_compiled_pattern(template, re.IGNORECASE | re.VERBOSE | re.UNICODE)
Beispiel #3
0
 def build(self, *which):
     """
     Join the requested ``<name>_regex`` attributes into one compiled pattern.

     :param which: a (sub)set of ["uri", "email"]
     :returns: compiled pattern whose alternatives sit inside the named
         group ``string`` (flags: IGNORECASE | VERBOSE | UNICODE)
     """
     # Alternatives are separated by newline-padded '|' because the pattern
     # is compiled in VERBOSE mode.
     joined = u'\n|\n'.join(
         getattr(self, kind + '_regex') for kind in which)
     pattern_source = u'(?P<string>\n%s\n)' % joined
     return rfc3987.get_compiled_pattern(
         pattern_source, re.IGNORECASE | re.VERBOSE | re.UNICODE)
Beispiel #4
0
def extract_text_from_html(input_, pre_strip_urls=False):
    """
    Extract the filtered plain text found in the body of an HTML document.

    The text is taken from ``body.get_text()`` of the parsed document,
    stripped of surrounding whitespace, optionally cleared of URIs, and
    finally passed through ``filter_text``.

    Parameters
    ----------
    input_ : str or bs4.BeautifulSoup
        HTML document from which the text will be extracted. Strings are
        parsed with ``parse_html`` first.

    pre_strip_urls : bool
        If True, everything matching the rfc3987 ``URI`` rule is replaced by
        a single space before filtering.

    Returns
    -------
    text : str or None
        The result of ``filter_text`` on the extracted text, or None when
        reading the body text raises TypeError or AttributeError.

    Raises
    ------
    ValueError
        If ``input_`` is neither a string nor a BeautifulSoup object, or if
        ``pre_strip_urls`` is requested while ``RFC_REGEX_AVAILABLE`` is
        false.
    """
    if isinstance(input_, bs4.BeautifulSoup):
        soup = input_
    elif isinstance(input_, basestring):
        soup = parse_html(input_)
    else:
        message = ("unknown input_ - expected str or "
                   "bs4.BeautifulSoup got '%s'" % (type(input_)))
        raise ValueError(message)

    try:
        raw_text = soup.body.get_text().strip()
    except (TypeError, AttributeError):
        # No usable body text could be read from the document.
        return None

    if pre_strip_urls:
        if not RFC_REGEX_AVAILABLE:
            raise ValueError("pre_strip_urls requires regex and rfc3987 "
                             "libraries. One or both could not be found")
        uri_pattern = rfc3987.get_compiled_pattern(rule='URI')
        raw_text = uri_pattern.sub(" ", raw_text)

    return filter_text(raw_text)
Beispiel #5
0
import rfc3987

URL_REGEX = rfc3987.get_compiled_pattern('%(URI)s')


def extract_urls(text):
    '''Extract urls from given text

    :text: (str) Text to extract urls from
    :returns: List of urls extracted, each one the complete matched string

    '''
    # When a pattern contains capture groups, findall() returns tuples of the
    # sub-groups rather than the matched text (rfc3987 patterns carry named
    # groups). Collecting group(0) of every match yields the URLs themselves
    # whether or not the pattern has groups.
    return [found.group(0) for found in URL_REGEX.finditer(text)]
    return (instance.__class__,) + args

# Module-level dictionaries, each starting out empty.
# NOTE(review): the names suggest per-topic caches; how they are keyed and
# filled is not visible here - confirm against the code that uses them.
cache_container_agnostic_configuration = {}
cache_container_ssl_configuration = {}
cache_container_url2entity_configuration = {}
cache_container_nginx_fs = {}
cache_extra_from_distrib = {}



####################
# URL processing
####################

URI_rfc3987                        =       rfc3987.get_compiled_pattern('^%(URI)s$')

def common_process_uri(
    le_root_configuration,
    d,
    current_line,
    current_server,
    current_port,
    current_mapping_type,
    l_bad_configurations,
    suffixwith
):
    """Pick the URI entry out of ``d`` and split it with ``URI_rfc3987``.

    NOTE(review): only the opening statements of this function are visible
    here; of the parameters, just ``d`` and ``suffixwith`` are used in them.
    The purpose of the remaining parameters cannot be told from this part.
    """
    # Take the first (key, value) pair of ``d`` whose key starts with
    # ``suffixwith`` (used as a prefix, despite its name) and ends with 'URI'.
    # Raises IndexError when no key qualifies.
    key_uri, uri                    = [ ( k, v ) for k, v in d.items() if k.startswith( suffixwith ) and k.endswith( 'URI' ) ][ 0 ]

    # Named groups of the full-string URI match. match() returns None for a
    # value that is not a valid URI, in which case this raises AttributeError.
    d_rfc3987 = URI_rfc3987.match( uri ).groupdict()
        except Exception as e:
            pass
    if not dataset_ref:
        dataset_ref = g.value(predicate=RDF.type, object=DCAT.Dataset)
    return dataset_ref

# TODO disallows whitespaces
# Case-insensitive, whole-string pattern for http(s)/ftp(s) URLs whose host is
# a dotted domain name, "localhost" or a dotted-quad address.
VALID_URL = re.compile(
        r'^(?:http|ftp)s?://' # scheme: http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # dotted domain name ending in a TLD-like label...
        r'localhost|' # ...or the literal host "localhost"...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or four dot-separated groups of 1-3 digits (value range not checked)
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE) # rest: nothing, a lone '/', or '/' or '?' followed by non-whitespace

URI = get_compiled_pattern('^%(URI)s$')


def is_valid_url(references):
    """Tell whether ``references`` parses into a URL with scheme and host.

    :param references: the value to check
    :returns: True when ``urlparse.urlparse`` yields both a non-empty scheme
        and a non-empty network location; False otherwise, including when
        parsing raises any exception
    """
    try:
        parsed = urlparse.urlparse(references)
        has_scheme_and_host = bool(parsed.scheme and parsed.netloc)
    except Exception:
        # Anything that cannot be parsed at all counts as invalid.
        has_scheme_and_host = False
    return has_scheme_and_host


def is_valid_uri(references):
    """Return True when ``references`` matches the module-level ``URI`` pattern."""
    # A match object is always truthy, so testing for None is the same check.
    return URI.match(references) is not None


def graph_from_opendatasoft(g, dataset_dict, portal_url):
Beispiel #8
0
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent encoding)

    :param iri: the IRI to check; a value that is not a ``str`` is decoded
        as UTF-8 first
    :returns: ``iri`` itself when it already parses as an IRI, otherwise a
        quoted version built from its parts (or, as a last resort, the
        percent-encoded input)
    :raises Exception: when the invalid input has no scheme or no net
        location, so nothing can be repaired
    """
    # First decode the IRI if needed
    if not isinstance(iri, str):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the IRI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # rfc3987.parse reports an invalid string with ValueError. Catching
        # only that keeps KeyboardInterrupt/SystemExit from being swallowed.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise Exception(message)

    logger.debug(
        "The IRI contains all necessary parts (scheme + net location)")

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to match
    # the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Scheme and authority are left untouched; in the path (and fragment)
    # every invalid character is replaced with an underscore (no need to
    # roundtrip), while the query is percent-quoted.
    quoted_parts = {
        'scheme': parts.scheme,
        'authority': parts.netloc,
        'path': no_invalid_characters.sub(u'_', parts.path),
    }
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(
            u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'),
                                             safe="&=")

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
Beispiel #9
0
import re
import sys
from rfc3986 import is_valid_uri
from rfc3987 import get_compiled_pattern
from strict_rfc3339 import validate_rfc3339

# Python 3 has no ``long`` or ``unicode`` builtins; alias them to their
# Python 3 counterparts so the names exist on either major version.
if sys.version_info[0] >= 3:
    long, unicode = int, str

email_regex = re.compile(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)')
ipv4_regex = get_compiled_pattern('^%(IPv4address)s$')
ipv6_regex = get_compiled_pattern('^%(IPv6address)s$')
# Case-insensitive pattern: 1-63 letters, digits or hyphens running up to the
# end of the string, neither starting nor ending with a hyphen. Written as a
# raw string so that \d reaches the regex engine without relying on an
# invalid string escape (which newer Python versions warn about).
hostname_regex = re.compile(r'(?!-)[A-Z\d-]{1,63}(?<!-)$', re.IGNORECASE)

rfc3339 = validate_rfc3339


def email(value):
    """Return True when ``value`` matches ``email_regex``, otherwise False."""
    # ``is not None`` is the identity test for "a match was found"; the
    # former ``!= None`` went through the equality protocol instead.
    return email_regex.match(value) is not None


def ipv4(value):
    """Return True when ``value`` matches ``ipv4_regex``, otherwise False."""
    # ``is not None`` is the identity test for "a match was found"; the
    # former ``!= None`` went through the equality protocol instead.
    return ipv4_regex.match(value) is not None


def ipv6(value):
    """Return True when ``value`` matches ``ipv6_regex``, otherwise False."""
    # ``is not None`` is the identity test for "a match was found"; the
    # former ``!= None`` went through the equality protocol instead.
    return ipv6_regex.match(value) is not None


def uri(value):
Beispiel #10
0
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent encoding)

    :param iri: the IRI to check; a value that is not ``unicode`` is decoded
        as UTF-8 first
    :returns: ``iri`` itself when it already parses as an IRI, otherwise a
        quoted version built from its parts (or, as a last resort, the
        percent-encoded input)
    :raises Exception: when the invalid input has no scheme or no net
        location, so nothing can be repaired
    """
    # First decode the IRI if needed
    if not isinstance(iri, unicode):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the IRI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # rfc3987.parse reports an invalid string with ValueError. Catching
        # only that keeps KeyboardInterrupt/SystemExit from being swallowed.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise Exception(message)

    logger.debug("The IRI contains all necessary parts (scheme + net location)")

    # Get the 'anti-pattern' for the valid characters (see rfc3987 package)
    # This is roughly the ipchar pattern plus the '/' as we don't need to match
    # the entire path, but merely the individual characters
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Scheme and authority are left untouched; in the path (and fragment)
    # every invalid character is replaced with an underscore (no need to
    # roundtrip), while the query is percent-quoted.
    quoted_parts = {
        'scheme': parts.scheme,
        'authority': parts.netloc,
        'path': no_invalid_characters.sub(u'_', parts.path),
    }
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'), safe="&=")

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
Beispiel #11
0
import unittest
import urlparse
from collections import Counter, OrderedDict
from operator import attrgetter, itemgetter

import bs4
import praw
import rfc3987
import urlobject
from jinja2 import Markup, Environment, PackageLoader

# Application identity; the logger name and the user agent below are derived
# from these values.
__APPNAME__ = "redem"
__VERSION__ = "0.1.1"
# '\%' is not a recognised escape sequence, so the resulting string keeps a
# literal backslash between "(praw)" and the version number.
# NOTE(review): a '/' separator may have been intended here - confirm.
__USER_AGENT__ = '%s (praw)\%s' % (__APPNAME__, __VERSION__)

# Pattern compiled from the rfc3987 rule named 'URI'.
uri_rgx = rfc3987.get_compiled_pattern('URI')  # URI_reference
# Logger named "<app name>.cli".
log = logging.getLogger('%s.cli' % __APPNAME__)

SUBMISSION_ATTRS = (
    'id',
    #'author',
    'num_comments',
    'selftext',
    'selftext_html',
    'subreddit_id',
    'title',
    'url',
    'domain',
    'created',
    'created_utc',
    'ups',
Beispiel #12
0
 def _is_valid(self, url):
     """Check URL validity using RFC 3986 based regular expression

     Returns True when ``url`` matches the rfc3987 ``URI`` rule from start
     to end, False otherwise.
     """
     # FIXME: disallow file:// scheme - security risk
     full_uri = rfc3987.get_compiled_pattern('^%(URI)s$')
     if full_uri.match(url) is None:
         return False
     return True
Beispiel #13
0
    return pp.Group(
        pp.Keyword(tag_name) + COLON + BOOLEAN + EOL
    )(tag_name)


def id_only_tag(tag_name: str) -> pp.ParserElement:
    """Build a grouped parser for a ``<tag_name>: <id>`` line.

    The line is the keyword ``tag_name``, then ``COLON``, ``_id_value`` and
    ``EOL``; the group is stored under the results name ``tag_name``.
    """
    line = pp.Keyword(tag_name) + COLON + _id_value + EOL
    grouped = pp.Group(line)
    return grouped(tag_name)


def basic_tag_value_pair(tag_name: str) -> pp.ParserElement:
    """Build a grouped parser for a ``<tag_name>: <unquoted value>`` line.

    The line is the keyword ``tag_name``, then ``COLON``, ``obo_unquoted``
    and ``EOL``; the group is stored under the results name ``tag_name``.
    """
    line = pp.Keyword(tag_name) + COLON + obo_unquoted + EOL
    grouped = pp.Group(line)
    return grouped(tag_name)


# One of the three stanza keywords.
stanza_type = (pp.Keyword('Term') | pp.Keyword('Typedef') | pp.Keyword('Instance'))
# A stanza type enclosed by OPEN_BRACKET/CLOSE_BRACKET at the start of a line,
# stored under the results name 'stanza_name'.
stanza_name = (pp.LineStart() + OPEN_BRACKET + stanza_type + CLOSE_BRACKET)('stanza_name')
# Tag name: one or more characters that are neither ':' nor a newline.
tag = pp.Regex(r'[^:\n]+')('tag')
# Value: one or more words (printable characters except '{', '!' and '"') or
# quoted strings, stopping at the end of the line; the pieces are joined with
# single spaces by the parse action.
value = pp.OneOrMore(
    pp.Word(pp.printables, excludeChars='{!"') | quoted_string, stopOn=pp.LineEnd()).setParseAction(
    ' '.join)('value')
# 'xsd:' followed by letters, combined into a single token named 'xsd-type'.
xsd_type = pp.Combine('xsd:' + pp.Word(pp.alphas))('xsd-type')
# Dbxref name: a run of backslash-escaped characters or characters other than
# ']', ',', '"', backslash and newline, optionally followed by
# unqoted_quote_string (defined elsewhere).
dbxref_name = (pp.Regex(r'(?:[^],\"\\\n]|\\.)+') +
               pp.Optional(unqoted_quote_string))
# Bracketed, comma-separated list of dbxref names; the list may be empty.
dbxref = pp.Group(
    OPEN_BRACKET + (pp.delimitedList(dbxref_name, ',') | pp.Empty()) + CLOSE_BRACKET
)('dbxref')
# Generic "tag: value" line, grouped.
tag_value_pair = pp.Group(tag + COLON + value + EOL)
# Any run of characters up to a NUL or a newline.
file_path = pp.Regex(r'[^\0\n]+')
# Regex element built from the pattern compiled for the 'IRI' rule.
iri = pp.Regex(get_compiled_pattern('%(IRI)s'))