def get_uris(strings, min_length=10):
    """Return the entries of ``strings`` that are complete URIs.

    An entry is kept when it is strictly longer than ``min_length``
    characters and the anchored ``^%(URI)s$`` pattern obtained from
    ``rfc3987.get_compiled_pattern`` matches it.

    :param strings: sized collection of candidate strings; ``None`` and
        empty collections are accepted
    :param min_length: entries of this length or shorter are skipped
    :returns: list of the matching entries in input order, or ``None``
        when there is nothing to scan
    """
    # used when calling .extract_fileurl() before .extract_strings()
    # A truthiness test (instead of ``len(strings) <= 0``) also covers
    # ``None``, which would otherwise raise TypeError in ``len``.
    if not strings:
        return None

    uri_pattern = rfc3987.get_compiled_pattern("^%(URI)s$")
    return [
        candidate
        for candidate in strings
        if len(candidate) > min_length and uri_pattern.match(candidate)
    ]
def build(self, *which):
    """
    Compile one alternation pattern from the selected ``<name>_regex``
    attributes of this object.

    Each requested component is looked up as ``self.<name>_regex``; the
    components are joined with ``|`` inside a single named group
    ``string`` and handed to ``rfc3987.get_compiled_pattern`` with the
    IGNORECASE, VERBOSE and UNICODE flags.

    :param which: a (sub)set of ["uri", "email"]
    :returns: whatever ``rfc3987.get_compiled_pattern`` returns for the
        assembled template
    """
    alternatives = [getattr(self, kind + '_regex') for kind in which]
    joined = u'\n|\n'.join(alternatives)
    template = u'(?P<string>\n%s\n)' % joined
    flags = re.IGNORECASE | re.VERBOSE | re.UNICODE
    return rfc3987.get_compiled_pattern(template, flags)
def build(self, *which):
    """
    Assemble and compile a combined matcher for the requested kinds.

    For every name in ``which`` the attribute ``self.<name>_regex`` is
    fetched; the fragments become the alternatives of one named group
    called ``string``. The resulting template is passed to
    ``rfc3987.get_compiled_pattern`` together with the flags
    ``re.IGNORECASE | re.VERBOSE | re.UNICODE``.

    :param which: a (sub)set of ["uri", "email"]
    :returns: the object returned by ``rfc3987.get_compiled_pattern``
    """
    fragments = []
    fragments.extend(getattr(self, name + '_regex') for name in which)
    pattern_source = u'(?P<string>\n%s\n)' % u'\n|\n'.join(fragments)
    return rfc3987.get_compiled_pattern(
        pattern_source,
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )
def extract_text_from_html(input_, pre_strip_urls=False):
    """
    Extract the filtered body text of an HTML document.

    The text is read with ``body.get_text()`` on the parsed document,
    stripped of surrounding whitespace, optionally has URI matches
    replaced by a space, and is finally passed through ``filter_text``,
    whose result is returned.

    NOTE(review): earlier documentation described the result as
    structure-free text without punctuation or non alphabetic
    characters; that filtering is done by ``filter_text``, which is not
    visible here -- confirm there.

    Parameters
    ----------
    input_ : str or bs4.BeautifulSoup
        HTML document from which the text will be extracted. Strings are
        parsed with ``parse_html`` first.
    pre_strip_urls : bool
        If True, URIs found in the body text are replaced by a single
        space before the text is filtered.

    Returns
    -------
    text : str or None
        The filtered text, or None when reading the body text raises
        TypeError or AttributeError (e.g. no usable ``body``).

    Raises
    ------
    ValueError
        If ``input_`` is neither a string nor a ``bs4.BeautifulSoup``,
        or if ``pre_strip_urls`` is set while ``RFC_REGEX_AVAILABLE`` is
        false.
    """
    if isinstance(input_, bs4.BeautifulSoup):
        soup = input_
    elif isinstance(input_, basestring):
        soup = parse_html(input_)
    else:
        raise ValueError("unknown input_ - expected str or "
                         "bs4.BeautifulSoup got '%s'" % (type(input_)))

    try:
        body_text = soup.body.get_text().strip()
    except (TypeError, AttributeError):
        return None

    # Common case first: nothing to strip, just filter.
    if not pre_strip_urls:
        return filter_text(body_text)

    if not RFC_REGEX_AVAILABLE:
        raise ValueError("pre_strip_urls requires regex and rfc3987 "
                         "libraries. One or both could not be found")

    uri_pattern = rfc3987.get_compiled_pattern(rule='URI')
    return filter_text(uri_pattern.sub(" ", body_text))
import rfc3987

# Unanchored URI pattern, so it can be searched for anywhere inside a text.
URL_REGEX = rfc3987.get_compiled_pattern('%(URI)s')


def extract_urls(text):
    '''Extract urls from given text

    :text: (str) Text to extract urls from
    :returns: List of urls extracted, each as the full matched string,
        in order of appearance

    '''
    # The rfc3987 URI pattern contains capturing groups (scheme, authority,
    # path, ...). ``findall`` on such a pattern returns tuples of the group
    # values instead of the whole match, so collect ``group(0)`` explicitly.
    return [match.group(0) for match in URL_REGEX.finditer(text)]
return (instance.__class__,) + args cache_container_agnostic_configuration = {} cache_container_ssl_configuration = {} cache_container_url2entity_configuration = {} cache_container_nginx_fs = {} cache_extra_from_distrib = {} #################### # Traitement des URL #################### URI_rfc3987 = rfc3987.get_compiled_pattern('^%(URI)s$') def common_process_uri( le_root_configuration, d, current_line, current_server, current_port, current_mapping_type, l_bad_configurations, suffixwith ): key_uri, uri = [ ( k, v ) for k, v in d.items() if k.startswith( suffixwith ) and k.endswith( 'URI' ) ][ 0 ] d_rfc3987 = URI_rfc3987.match( uri ).groupdict()
except Exception as e: pass if not dataset_ref: dataset_ref = g.value(predicate=RDF.type, object=DCAT.Dataset) return dataset_ref # TODO disallows whitespaces VALID_URL = re.compile( r'^(?:http|ftp)s?://' # http:// or https:// r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... r'localhost|' #localhost... r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip r'(?::\d+)?' # optional port r'(?:/?|[/?]\S+)$', re.IGNORECASE) URI = get_compiled_pattern('^%(URI)s$') def is_valid_url(references): try: res = urlparse.urlparse(references) return bool(res.scheme and res.netloc) except Exception as e: return False def is_valid_uri(references): return bool(URI.match(references)) def graph_from_opendatasoft(g, dataset_dict, portal_url):
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent
    encoding).

    Processing steps:

    1. Byte strings are decoded as UTF-8.
    2. If the value already parses under the ``IRI`` rule it is returned
       unchanged.
    3. Otherwise it is split with ``urlparse.urlsplit``; invalid characters
       in the path and fragment are replaced by ``_``, the query is
       percent-quoted (keeping ``&`` and ``=``), scheme and net location
       are kept as they are, and the parts are recomposed.
    4. If the recomposed value still does not parse, the whole original
       value is percent-quoted with ``urllib.quote``.

    :param iri: the IRI to sanitise, as text or as a UTF-8 byte string
    :returns: the original IRI when valid, otherwise a quoted variant
    :raises ValueError: when the value is not a valid IRI and has no scheme
        or no net location, so it cannot be repaired
    """
    # First decode the IRI if needed. Checking for ``bytes`` (an alias of
    # ``str`` on Python 2) only decodes real byte strings and never calls
    # ``.decode`` on text that is already unicode.
    if isinstance(iri, bytes):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the URI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # rfc3987.parse signals a non-matching string with ValueError.
        # The URI is not valid, so we'll have to fix it.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise ValueError(message)

    logger.debug(
        "The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}

    # We'll now convert the path, query and fragment parts of the URI.
    # Get the 'anti-pattern' for the valid characters (see rfc3987 package).
    # This is roughly the ipchar pattern plus the '/' as we don't need to
    # match the entire path, but merely the individual characters.
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(
            u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'),
                                             safe="&=")

    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
import re import sys from rfc3986 import is_valid_uri from rfc3987 import get_compiled_pattern from strict_rfc3339 import validate_rfc3339 if sys.version_info > (3, ): long = int unicode = str email_regex = re.compile(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)') ipv4_regex = get_compiled_pattern('^%(IPv4address)s$') ipv6_regex = get_compiled_pattern('^%(IPv6address)s$') hostname_regex = re.compile('(?!-)[A-Z\d-]{1,63}(?<!-)$', re.IGNORECASE) rfc3339 = validate_rfc3339 def email(value): return email_regex.match(value) != None def ipv4(value): return ipv4_regex.match(value) != None def ipv6(value): return ipv6_regex.match(value) != None def uri(value):
def to_iri(iri):
    """
    Safely quotes an IRI in a way that is resilient to unicode and incorrect
    arguments (checks for RFC 3987 compliance and falls back to percent
    encoding).

    Processing steps:

    1. Byte strings are decoded as UTF-8.
    2. If the value already parses under the ``IRI`` rule it is returned
       unchanged.
    3. Otherwise it is split with ``urlparse.urlsplit``; invalid characters
       in the path and fragment are replaced by ``_``, the query is
       percent-quoted (keeping ``&`` and ``=``), scheme and net location
       are kept as they are, and the parts are recomposed.
    4. If the recomposed value still does not parse, the whole original
       value is percent-quoted with ``urllib.quote``.

    :param iri: the IRI to sanitise, as text or as a UTF-8 byte string
    :returns: the original IRI when valid, otherwise a quoted variant
    :raises ValueError: when the value is not a valid IRI and has no scheme
        or no net location, so it cannot be repaired
    """
    # First decode the IRI if needed. ``bytes`` is an alias of ``str`` on
    # Python 2, so this decodes exactly the non-unicode strings without
    # depending on the Python 2-only ``unicode`` name.
    if isinstance(iri, bytes):
        logger.debug("Converting IRI to unicode")
        iri = iri.decode('utf-8')

    try:
        # If we can safely parse the URI, then we don't
        # need to do anything special here
        rfc3987.parse(iri, rule='IRI')
    except ValueError:
        # rfc3987.parse signals a non-matching string with ValueError.
        # The URI is not valid, so we'll have to fix it.
        logger.debug("The IRI is not valid, proceeding to quote...")
    else:
        logger.debug("This is already a valid IRI, doing nothing...")
        return iri

    # First see whether we can actually parse it *as if* it is a URI
    parts = urlparse.urlsplit(iri)
    if not parts.scheme or not parts.netloc:
        # If there is no scheme (e.g. http) nor a net location (e.g.
        # example.com) then we cannot do anything
        message = ("The argument you provided does not comply with "
                   "RFC 3987 and is not parseable as a IRI "
                   "(there is no scheme or no net location part)")
        logger.error(message)
        logger.error(iri)
        raise ValueError(message)

    logger.debug("The IRI contains all necessary parts (scheme + net location)")
    quoted_parts = {}

    # We'll now convert the path, query and fragment parts of the URI.
    # Get the 'anti-pattern' for the valid characters (see rfc3987 package).
    # This is roughly the ipchar pattern plus the '/' as we don't need to
    # match the entire path, but merely the individual characters.
    no_invalid_characters = rfc3987.get_compiled_pattern(
        "(?!%(iunreserved)s|%(pct_encoded)s|%(sub_delims)s|:|@|/)(.)")

    # Replace the invalid characters with an underscore (no need to roundtrip)
    quoted_parts['path'] = no_invalid_characters.sub(u'_', parts.path)
    if parts.fragment:
        quoted_parts['fragment'] = no_invalid_characters.sub(
            u'_', parts.fragment)
    if parts.query:
        quoted_parts['query'] = urllib.quote(parts.query.encode('utf-8'),
                                             safe="&=")

    # Leave these untouched
    quoted_parts['scheme'] = parts.scheme
    quoted_parts['authority'] = parts.netloc

    # Extra check to make sure we now have a valid IRI
    quoted_iri = rfc3987.compose(**quoted_parts)
    try:
        rfc3987.parse(quoted_iri)
    except ValueError:
        # Unable to generate a valid quoted iri, using the straightforward
        # urllib percent quoting (but this is ugly!)
        logger.warning('Could not safely quote as IRI, falling back to '
                       'percent encoding')
        quoted_iri = urllib.quote(iri.encode('utf-8'))

    return quoted_iri
import unittest import urlparse from collections import Counter, OrderedDict from operator import attrgetter, itemgetter import bs4 import praw import rfc3987 import urlobject from jinja2 import Markup, Environment, PackageLoader __APPNAME__ = "redem" __VERSION__ = "0.1.1" __USER_AGENT__ = '%s (praw)\%s' % (__APPNAME__, __VERSION__) uri_rgx = rfc3987.get_compiled_pattern('URI') # URI_reference log = logging.getLogger('%s.cli' % __APPNAME__) SUBMISSION_ATTRS = ( 'id', #'author', 'num_comments', 'selftext', 'selftext_html', 'subreddit_id', 'title', 'url', 'domain', 'created', 'created_utc', 'ups',
def _is_valid(self, url):
    """Tell whether ``url`` matches the anchored ``URI`` rule in full.

    The check uses the ``^%(URI)s$`` pattern obtained from
    ``rfc3987.get_compiled_pattern``; the scheme is not restricted.

    :param url: the string to validate
    :returns: True when the whole string matches, False otherwise
    """
    # FIXME: disallow file:// scheme - security risk
    uri_matcher = rfc3987.get_compiled_pattern('^%(URI)s$')
    if uri_matcher.match(url) is None:
        return False
    return True
return pp.Group( pp.Keyword(tag_name) + COLON + BOOLEAN + EOL )(tag_name) def id_only_tag(tag_name: str) -> pp.ParserElement: return pp.Group( pp.Keyword(tag_name) + COLON + _id_value + EOL )(tag_name) def basic_tag_value_pair(tag_name: str) -> pp.ParserElement: return pp.Group(pp.Keyword(tag_name) + COLON + obo_unquoted + EOL)(tag_name) stanza_type = (pp.Keyword('Term') | pp.Keyword('Typedef') | pp.Keyword('Instance')) stanza_name = (pp.LineStart() + OPEN_BRACKET + stanza_type + CLOSE_BRACKET)('stanza_name') tag = pp.Regex(r'[^:\n]+')('tag') value = pp.OneOrMore( pp.Word(pp.printables, excludeChars='{!"') | quoted_string, stopOn=pp.LineEnd()).setParseAction( ' '.join)('value') xsd_type = pp.Combine('xsd:' + pp.Word(pp.alphas))('xsd-type') dbxref_name = (pp.Regex(r'(?:[^],\"\\\n]|\\.)+') + pp.Optional(unqoted_quote_string)) dbxref = pp.Group( OPEN_BRACKET + (pp.delimitedList(dbxref_name, ',') | pp.Empty()) + CLOSE_BRACKET )('dbxref') tag_value_pair = pp.Group(tag + COLON + value + EOL) file_path = pp.Regex(r'[^\0\n]+') iri = pp.Regex(get_compiled_pattern('%(IRI)s'))