def _get_uri_from_label(self, def_text):
    """
    Fetches a URI given a label by searching all term labels in braces
    ('{' and '}').  For example, if we encounter
    "{whole plant phenological stage}", it will be converted to
    "http://purl.obolibrary.org/obo/PPO_0000001".
    """
    labelre = re.compile(r'(\{[A-Za-z0-9\- _]+\})')
    defparts = labelre.split(def_text)

    newdef = ''
    for defpart in defparts:
        if labelre.match(defpart):
            label = defpart.strip("{}")

            # Get the class IRI associated with this label.
            try:
                labelIRI = self.__label_map.lookupIRI(label)
            except KeyError:
                raise RuntimeError(
                    'The class label, "' + label
                    + '", could not be matched to a term IRI.')

            newdef += str(labelIRI)
        else:
            newdef += defpart

    if len(defparts) == 0:
        newdef = def_text

    if len(newdef) != 0:
        # Attempt parsing with the rfc3987 library; raises ValueError if
        # the result is not a valid IRI.
        rfc3987.parse(newdef, rule='IRI')

    return newdef
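Below is a minimal sketch of the brace-label expansion the method performs, assuming a plain dict stands in for self.__label_map; the example label and PPO IRI are taken from the docstring above.

import re

import rfc3987

label_map = {'whole plant phenological stage':
             'http://purl.obolibrary.org/obo/PPO_0000001'}
labelre = re.compile(r'(\{[A-Za-z0-9\- _]+\})')
parts = labelre.split('{whole plant phenological stage}')
# Replace each brace-wrapped label with its IRI; keep everything else.
expanded = ''.join(label_map[p.strip('{}')] if labelre.match(p) else p
                   for p in parts)
rfc3987.parse(expanded, rule='IRI')  # raises ValueError if not a valid IRI
print(expanded)  # http://purl.obolibrary.org/obo/PPO_0000001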
def clientfunc(connection, addr):
    print("Got connection from", addr)
    connection.send(b"Server up and running")
    while True:
        connection.send(b"Ready for another url")
        valid = True
        url_received = connection.recv(1024).decode()
        try:
            parse(url_received, rule="IRI")
        except Exception:
            valid = False
        if valid:
            # Keep only the host part, e.g. 'http://host/...' -> 'host'.
            url_received = url_received.split('/')[2]
            print("Do you want to write URL:", url_received, "to config file")
            print("for the node", addr)
            response = input("(yes/no): ")
            if response == 'yes':
                with open("user.action", 'a') as fp:
                    fp.write(url_received)
                    fp.write("\n")
                print("URL written to the file user.action")
                print("Done")
                connection.send(b"The URL is accepted")
            else:
                print("URL not added")
                print("Message sent to client")
                connection.send(b"The URL is blocked by admin")
        else:
            connection.send(b"The URL is not valid")
    connection.close()
def webCheck(website):
    try:
        print(parse(website, rule='IRI'))
        return True
    except ValueError:
        return False
def expandURL(self, url_pattern, row, datatype=False):
    """Takes a Jinja or Python formatted string, applies it to the row
    values, and returns it as a URIRef"""
    try:
        unicode_url_pattern = unicode(url_pattern)
    except NameError:
        # Python 3 has no unicode builtin.
        unicode_url_pattern = str(url_pattern).split(')')[0].split('(')[-1]

    url = self.render_pattern(unicode_url_pattern, row)

    # DEPRECATED
    # for ns, nsuri in namespaces.items():
    #     if url.startswith(ns):
    #         url = url.replace(ns + ':', nsuri)
    #         break

    try:
        iri = iribaker.to_iri(url)
        rfc3987.parse(iri, rule='IRI')
    except Exception:
        raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

    return URIRef(iri)
def is_iri(value):
    # Note: despite the name, this validates against the URI rule, not IRI.
    try:
        import rfc3987
        rfc3987.parse(value, rule="URI")
        return True
    except Exception:
        return False
def url_remapper(src, dest):
    src_parts = parse(src, 'URI')
    dest_parts = parse(dest, 'URI')
    src_path = Path(unquote(src_parts['path'])).resolve()
    dest_path = Path(unquote(dest_parts['path'])).resolve()

    def remap(url):
        url_parts = parse(url, 'URI')
        if not (url_parts['scheme'] == src_parts['scheme']
                and url_parts['authority'] == src_parts['authority']):
            return False, url
        url_path = Path(unquote(url_parts['path'])).resolve()
        if src_path != url_path and src_path not in url_path.parents:
            return False, url
        result_path = dest_path / url_path.relative_to(src_path)
        # Use a trailing slash if the incoming path had one. This facilitates
        # further URI resolution operations.
        if url_parts['path'].endswith('/'):
            final_path = f'{result_path}/'
        else:
            final_path = str(result_path)
        return True, (compose(scheme=dest_parts['scheme'],
                              authority=dest_parts['authority'],
                              path=quote(final_path),
                              query=url_parts['query'],
                              fragment=url_parts['fragment']))

    return remap
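A usage sketch for url_remapper under made-up example URLs: remap() rewrites URLs under the source prefix onto the destination prefix and passes everything else through unchanged. Since paths go through Path.resolve(), this assumes a POSIX-style filesystem.

remap = url_remapper('http://example.com/src/', 'http://example.com/dest/')
print(remap('http://example.com/src/a/b.html'))
# (True, 'http://example.com/dest/a/b.html')
print(remap('http://example.com/other/c.html'))
# (False, 'http://example.com/other/c.html')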
def __is_url(cls, url_string):
    from rfc3987 import parse
    try:
        parse(url_string, rule='IRI')
        return True
    except ValueError:
        return False
def checker(url):
    try:
        parse(url)
        return True
    except ValueError:
        return False
def check_uri(data):
    # six.string_types already covers str, so a single check suffices.
    if not isinstance(data, string_types):
        raise TypeError("URL must be a string, not a {}".format(
            type(data).__name__))
    try:
        rfc3987.parse(data, rule="URI")
    except Exception as e:
        print(e)
        raise TypeError('expected URL, found {}'.format(data))
def test_against_legacy_ref(url):
    legacy = True
    try:
        rfc3987.parse(url, rule='URI_reference')
    except ValueError:
        legacy = False
    new = validate_rfc3986(url, rule='URI_reference')
    assert legacy == bool(new)
def validate_format(self, value: str, *args: ty.Any,
                    **context: ty.Any) -> ValidationResult:
    try:
        rfc3987.parse(value, rule='URI')
    except ValueError:
        return self.messages['format']
    return None
def set_from_url(record, return_data):
    if len(record['SOURCEURL']) > 0:
        try:
            parse(record['SOURCEURL'], rule='IRI')
            return_data['fromURL'] = record['SOURCEURL']
        except ValueError:
            pass
    return return_data
def get_links_from_url(url, domain, params):
    tree = request_page_tree(url)
    links = []
    for a in tree.xpath('//a'):
        ignore_link = False
        try:
            link = str(a.get('href'))
        except Exception:
            # Link is somehow broken.
            link = ''
            ignore_link = True
        if 'ignore_urls_with' in params:
            if any(pattern in link for pattern in params['ignore_urls_with']):
                ignore_link = True
        if 'ignore_urls_without' in params:
            if not any(pattern in link
                       for pattern in params['ignore_urls_without']):
                ignore_link = True
        if ignore_link:
            continue
        try:
            # Strip fragments and, unless requested otherwise, query strings.
            if '#' in link:
                link = link[:link.find('#')]
            if 'retain_params' not in params and '?' in link:
                link = link[:link.find('?')]
            if parse(link, rule='IRI') and (domain + '/') in link:
                links.append(link)
        except ValueError:
            # Not an absolute IRI; try to build one relative to the domain.
            if ('http' not in link and 'java' not in link and len(link) > 1
                    and link != 'None' and link[0] not in ['?', '#']):
                if link[0] != '/':
                    link = '/' + link
                rel_link = 'http://' + domain + link
                try:
                    if parse(rel_link, rule='IRI'):
                        links.append(rel_link)
                except ValueError:
                    pass
    return list(set(links))
def insert_concept(self, data):
    try:
        parse(data['id'], rule='IRI')
        a = URIRef(data['id'])
    except ValueError:
        a = BNCF[data['id']]
    self.sparql.add((a, RDF.type, SKOS.Concept))
    self.sparql.add((a, RDFS.label, Literal(data['label'])))
    return 'OK'
def is_html(self, message):
    for suffix in self.urlExtensions:
        if suffix in message:
            try:
                parse(message, rule="URI")
                return True
            except ValueError:
                pass
    return False
def insert_organization(self, data):
    try:
        parse(data['id'], rule='IRI')
        a = URIRef(data['id'])
    except ValueError:
        a = AOP[data['id']]
    self.sparql.add((a, RDF.type, FOAF.Organization))
    self.sparql.add((a, FOAF.name, Literal(data['label'])))
    return 'OK'
def handle_profile_image(user_id, image_url):
    try:
        parse(image_url, rule="IRI")
    except ValueError:
        return False
    # The query result is the User instance itself, so name it accordingly.
    user = db.session.query(User).filter(User.id == user_id).first()
    user.image_url = image_url
    db.session.commit()
    return True
def get_message_type(message):
    try:
        parse(message, rule="IRI")
        if is_url_image(message):
            return "image"
        else:
            return "url"
    except ValueError:
        return "text"
def insert_place(self, data):
    try:
        parse(data['id'], rule='IRI')
        a = URIRef(data['id'])
    except ValueError:
        a = DBPEDIA[data['id']]
    self.sparql.add((a, RDF.type, DBPEDIA.Place))
    self.sparql.add((a, RDFS.label, Literal(data['label'])))
    return 'OK'
def __new__(cls, *args, **kwargs):
    if cls.nullable and args[0] is None:
        return None

    value = super().__new__(cls, *args, **kwargs)

    if cls.trim_whitespace:
        value = value.strip()

    if cls.min_length is not None:
        if len(value) < cls.min_length:
            if cls.min_length == 1:
                raise TypeSystemError(cls=cls, code='blank')
            else:
                raise TypeSystemError(cls=cls, code='min_length')

    if cls.max_length is not None:
        if len(value) > cls.max_length:
            raise TypeSystemError(cls=cls, code='max_length')

    if cls.pattern is not None:
        if not re.search(cls.pattern, value):
            raise TypeSystemError(cls=cls, code='pattern')

    # Validate format, if specified
    if cls.format == 'date':
        try:
            value = datetime.strptime(value, "%Y-%m-%d").date()
        except ValueError as e:
            raise TypeSystemError(str(e), cls=cls)
    elif cls.format == 'date-time':
        try:
            value = isodate.parse_datetime(value)
        except (ValueError, isodate.ISO8601Error) as e:
            raise TypeSystemError(str(e), cls=cls)
    elif cls.format == 'email':
        if '@' not in value:
            raise TypeSystemError('Not a valid email address.', cls=cls)
    elif cls.format == 'time':
        try:
            value = datetime.strptime(value, "%H:%M:%S")
        except ValueError as e:
            raise TypeSystemError(str(e), cls=cls)
    elif cls.format == 'uri':
        try:
            rfc3987.parse(value, rule='URI')
        except ValueError as e:
            raise TypeSystemError(str(e), cls=cls)

    # Coerce value to the native str type. We only do this if the value
    # is an instance of the class. It could be a datetime instance or
    # a str already if `trim_whitespace` is True.
    if isinstance(value, cls):
        value = cls.native_type(value)

    cls.validate(value)
    return value
def test_against_legacy_hypothesis(url):
    print(url)
    legacy = True
    try:
        rfc3987.parse(url, rule='URI')
    except ValueError:
        legacy = False
    new = validate_rfc3986(url)
    assert legacy == bool(new)
def checker(url):
    '''
    Check if the url is a valid one or not.
    '''
    try:
        parse(url)
        return True
    except ValueError:
        return False
def validate_url(url):
    """
    Validates URL (actually, IRIs).
    """
    try:
        rfc3987.parse(url, rule="IRI")
    except ValueError:
        return False
    return True
def insert_author(self, author):
    try:
        parse(author['author_id'], rule='IRI')
        # Wrap in URIRef so the triple gets an RDF term, not a plain string.
        a = URIRef(author['author_id'])
    except ValueError:
        a = AOP[author['author_id']]
    self.sparql.add((a, RDF.type, FOAF.Person))
    self.sparql.add((a, FOAF.name, Literal(author['author_fullname'])))
    if 'author_email' in author:
        self.sparql.add((a, SCHEMA.email, Literal(author['author_email'])))
    return 'OK'
def is_url(url):
    """
    Returns True if `url` is an IRI as specified in RFC 3987
    (https://www.ietf.org/rfc/rfc3987.txt)
    """
    try:
        rfc3987.parse(url, rule="IRI")
        return True
    except ValueError:
        logger.warning("%s is not a valid url.", url)
        return False
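A quick sanity check for is_url, assuming a configured logger object; both calls exercise the RFC 3987 IRI rule used above.

print(is_url('https://example.com/path?q=1'))  # True
print(is_url('not a url'))  # False, after logging a warning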
def guess_server_url(
    url: str,
    login_page: str = Options.startup_page,
    proxy: "Proxy" = None,
    timeout: int = 5,
) -> str:
    """
    Guess the complete server URL given an URL (either an IP address,
    a simple domain name or an already complete URL).

    Note: this function cannot be decorated with lru_cache().

    :param url: The server URL (IP, domain name, full URL).
    :param login_page: The Drive login page.
    :param int timeout: Timeout for each and every request.
    :return: The complete URL.
    """
    import requests
    import rfc3987
    from requests.exceptions import SSLError

    kwargs: Dict[str, Any] = {
        "timeout": timeout,
        "verify": Options.ca_bundle or not Options.ssl_no_verify,
    }
    for new_url in compute_urls(url):
        try:
            rfc3987.parse(new_url, rule="URI")
            log.debug(f"Testing URL {new_url!r}")
            full_url = f"{new_url}/{login_page}"
            if proxy:
                kwargs["proxies"] = proxy.settings(url=full_url)
            with requests.get(full_url, **kwargs) as resp:
                resp.raise_for_status()
                if resp.status_code == 200:  # Happens when JSF is installed
                    log.debug(f"Found URL: {new_url}")
                    return new_url
        except SSLError as exc:
            if "CERTIFICATE_VERIFY_FAILED" in str(exc):
                raise InvalidSSLCertificate()
        except requests.HTTPError as exc:
            if exc.response.status_code in (401, 403):
                # When there is only Web-UI installed, the code is 401.
                log.debug(f"Found URL: {new_url}")
                return new_url
        except (ValueError, requests.RequestException):
            log.debug(f"Bad URL: {new_url}")
        except Exception:
            log.exception("Unhandled error")

    if not url.lower().startswith("http"):
        return ""
    return url
def guess_server_url(
    url: str,
    login_page: str = Options.startup_page,
    proxy: "Proxy" = None,
    timeout: int = 5,
) -> str:
    """
    Guess the complete server URL given an URL (either an IP address,
    a simple domain name or an already complete URL).

    :param url: The server URL (IP, domain name, full URL).
    :param login_page: The Drive login page.
    :param int timeout: Timeout for each and every request.
    :return: The complete URL.
    """
    import requests
    import rfc3987
    from requests.exceptions import SSLError

    kwargs: Dict[str, Any] = {
        "timeout": timeout,
        "verify": Options.ca_bundle or not Options.ssl_no_verify,
    }
    for new_url in compute_urls(url):
        try:
            rfc3987.parse(new_url, rule="URI")
            log.debug(f"Testing URL {new_url!r}")
            full_url = f"{new_url}/{login_page}"
            if proxy:
                kwargs["proxies"] = proxy.settings(url=full_url)
            with requests.get(full_url, **kwargs) as resp:
                resp.raise_for_status()
                if resp.status_code == 200:
                    log.debug(f"Found URL: {new_url}")
                    return new_url
        except requests.HTTPError as exc:
            if exc.response.status_code in {401, 403}:
                # When there is only Web-UI installed, the code is 401.
                log.debug(f"Found URL: {new_url}")
                return new_url
        except SSLError as exc:
            if "CERTIFICATE_VERIFY_FAILED" in str(exc):
                raise InvalidSSLCertificate()
        except (ValueError, requests.RequestException):
            log.debug(f"Bad URL: {new_url}")
        except Exception:
            log.exception("Unhandled error")

    if not url.lower().startswith("http"):
        return ""
    return url
def getThumbnailContent(metadata):
    if 'image' not in metadata:
        return 'no preview'
    images = metadata['image']
    if len(images) < 1:
        return 'no preview'
    try:
        img_uri = images[0].strip()
        rfc3987.parse(img_uri, "URI")
        return ('<img src="https://steemitimages.com/128x256/{img_uri}" />'
                .format(**locals()))
    except Exception:
        return 'no preview'
def validate(self, value):
    """Check that the URL is valid, and optionally accessible."""
    try:
        parse(value)
    except ValueError:
        print(value)
        self.error("Value is not a valid URL")
    if self.verify_exists:
        try:
            get(value)
        except Exception:
            self.error("The URL appears to be inaccessible")
def escape_url(url):
    try:
        rfc3987.parse(url, rule="URI")
        return url
    except ValueError:
        if url.lower().startswith('https://'):
            scheme = 'https://'
        elif url.lower().startswith('http://'):
            scheme = 'http://'
        else:
            scheme = ''
        url = quote_url(url[len(scheme):])
        return scheme + url
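A sketch of the intended escape_url behavior, assuming quote_url percent-encodes the non-scheme part (for example, urllib.parse.quote): an already-valid URI passes through unchanged, while an invalid one is encoded and its scheme re-attached.

print(escape_url('http://example.com/a%20b'))
# 'http://example.com/a%20b' (already valid, returned as-is)
print(escape_url('http://example.com/a b'))
# 'http://example.com/a%20b', given a quote_url such as urllib.parse.quote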
def check_url(instance):
    # See https://github.com/Julian/jsonschema/blob/master/jsonschema/_format.py
    if not isinstance(instance, str_types):
        return True
    rfc3987.parse(instance, rule='URI')  # raises ValueError
    try:
        response = requests.get(instance, timeout=self.args.timeout)
        result = response.status_code == 200
        if not result:
            print('HTTP {} on GET {}'.format(response.status_code, instance))
        return result
    except requests.exceptions.Timeout:
        print('Timed out on GET {}'.format(instance))
        return False
def spider(base_urls, target):
    '''
    Loop through the initial links found in the given page. Each new link
    discovered will be added to the list if it's not already there, and thus
    crawled as well looking for more links.

    wannabe list works as the placeholder for the urls that are yet to crawl.
    base_urls is a list with all the already crawled urls.
    '''
    global target_
    target_ = parse(target)
    p = Pool(arguments.process)

    wannabe = [url for url in base_urls
               if target_['authority'] in parse(url)['authority']]

    while True:
        # Retrieve all the urls returned by the workers.
        new_urls = p.map(worker, wannabe)
        # Flatten them and remove repeated ones.
        new_urls = list(set(itertools.chain(*new_urls)))
        wannabe = []
        i = 0

        # If new_urls is empty, no more urls are being discovered; exit the loop.
        if new_urls == []:
            break
        else:
            for url in new_urls:
                if url not in base_urls:
                    # For each new url, check that it hasn't been crawled yet.
                    # If it's indeed new and contains the target domain, it
                    # gets appended to the wannabe list so it will be crawled
                    # in the next iteration.
                    i += 1
                    if target_['authority'] in parse(url)['authority']:
                        wannabe.append(url)
                    base_urls.append(url)

            print(colored('\nNew urls appended: {}\n'.format(i),
                          'green', attrs=['bold']))

    # Once all the links for the given depth have been analyzed, run the parser.
    parser(base_urls)
def is_valid_uri(instance):
    if not isinstance(instance, six.string_types):
        return True
    uri = urlparse(instance)
    query = urlencode(parse_qsl(unquote(uri.query.encode('utf-8'))))
    return rfc3987.parse(uri._replace(query=query).geturl(), rule='URI')
def serialize_fe(fe, reified, wiki_title, add_triple, format):
    # The FE predicate takes the FE label.
    p1 = _uri_for('FE', 'predicate', fe['FE'])
    # The FE object takes the linked entity URI and/or the literal.
    le_uri = fe.get('uri')
    literal = fe.get('literal')
    if le_uri:  # It's a URI
        wiki_title = quote(le_uri.split('/')[-1].encode('utf8'))
        o1 = NAMESPACES['resource'] + wiki_title
        parsed = parse(o1, rule='URI_reference')  # URI sanity check
        assert add_triple(reified, p1, o1)
    if literal:  # It's a literal
        if type(literal) in {str, unicode}:
            assert add_triple(reified, p1, literal)
        elif type(literal) == dict:
            if 'duration' in literal:
                assert add_triple(reified, p1, literal['duration'])
            if 'start' in literal:
                assert add_triple(reified,
                                  '%sstartYear' % NAMESPACES['ontology'],
                                  literal['start'])
            if 'end' in literal:
                assert add_triple(reified,
                                  '%sendYear' % NAMESPACES['ontology'],
                                  literal['end'])
        else:
            raise Exception("Don't know how to serialize: " + repr(literal))
def launch_the_stream():
    parser = create_parser()
    if len(argv) == 1:
        parser.print_help()
        exit(1)
    arguments = parser.parse_args()
    rv = 0

    statistics = UserStat(APPLICATION_NAME, STAT_FILE_NAME)
    statistics.load()
    nicknames = Nicknames(APPLICATION_NAME, ALIAS_FILE_NAME)
    nicknames.load()

    if arguments.stat:
        print(str(statistics))
    elif arguments.aliases:
        print(str(nicknames))
    elif arguments.clear:
        trimmed = statistics.fltr(lambda key, value: value > int(arguments.clear))
        statistics.save()
        print("Statistics cleared: {0}".format(trimmed))
    elif arguments.let and len(arguments.let) == 2:
        (nick, URL) = arguments.let
        nicknames.assign(nick, URL)
        # Extract the last part of the URL path as a streamer nick.
        streamer = [x for x in parse(URL)['path'].split('/') if x][-1]
        trimmed = statistics.fltr(lambda key, value: streamer not in key)
        statistics.save()
        nicknames.save()
        print("{0} was assigned to {1}; Statistics cleared: {2}".format(
            nick, URL, trimmed))
    else:
        rv = assemble_command(arguments, statistics, nicknames)
    return rv
def generate_validator_from_schema(schema_uri):
    # Download the schema to a string.
    schema = None

    # Handle http and file URIs.
    uri_split = rfc3987.parse(schema_uri)
    if uri_split['scheme'] in ("http", "https"):
        # It's http or https; use requests.
        schema = requests.get(schema_uri).json()
    elif uri_split['scheme'] == "file":
        # It's a file; open as normal.
        # Reconstitute the file path from the uri.
        with open(
                os.path.abspath(
                    os.path.join(uri_split['authority'], uri_split['path'])),
                'r') as schema_file:
            schema = json.load(schema_file)
    else:
        raise ValueError("schema uri must have file or url scheme")

    # Create a RefResolver to allow resolution of relative schema links.
    # This is required to use git branches / versions and local development
    # correctly. Don't use from_schema because it uses the $id baked into
    # the schema, and we want to avoid baking.
    handlers = dict(file=file_handler)
    resolver = jss.RefResolver(schema_uri, schema, handlers=handlers, store={})

    validator = jss.Draft7Validator(
        schema=schema,
        resolver=resolver,
    )

    return validator
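For reference, a sketch of the parts dict rfc3987.parse() produces for the file scheme handled above; the URI itself is a made-up example.

import rfc3987

parts = rfc3987.parse('file://localhost/schemas/item.json')
print(parts['scheme'], parts['authority'], parts['path'])
# file localhost /schemas/item.json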
def parse(ark_str):
    """Parse an ARK URL or an ARK ID string into an Ark object.

    Args:
        ark_str (str): The string to parse.

    Returns:
        Ark: The parsed ARK.

    Raises:
        ArkParsingError: If parsing fails.
    """
    try:
        parts = rfc3987.parse(ark_str, rule="URI")  # Ensure ark is a URI
        parser = Lark(_GRAMMAR, start='arkid')
        # Extract an ARK ID from ark_str if ark_str is a full ARK URL.
        if parts["scheme"] != _ARKID_SCHEME:
            arkid_str = parts["path"].lstrip("/")
            if not parts["authority"]:  # NMA is required
                msg = 'Name Mapping Authority cannot be null.'
                raise ArkParsingError(msg, ark_str)
        else:
            arkid_str = ark_str
        tree = parser.parse(arkid_str)
        ark_parts = ArkIdTransformer().transform(tree)
        ark_parts.update(parts)
        ark = Ark(**ark_parts)
        return Either.pure(ark)
    except (TypeError, ValueError, ParseError, UnexpectedCharacters) as ex:
        return Left(ArkParsingError(str(ex), ark_str))
def to_b64(self, image_filename, *args):
    """
    Returns a 'data:image/...;base64,...' string for the input file, where
    the image type comes from the URL extension or is detected by imghdr.
    Returns an empty string if the image cannot be read.
    """
    import base64

    self.logger.debug('Converting image %s to base64', image_filename)
    self.logger.debug('Current directory %s', os.path.abspath(os.curdir))
    try:
        img_info = parse(image_filename, rule='IRI')
        extension = img_info['path'].split('.')[-1]
        content = urlopen(image_filename)
    except ValueError:
        # Not a valid IRI, assume local file.
        self.logger.debug("Image '%s' doesn't have a valid URL, "
                          "assuming local", image_filename)
        try:
            extension = imghdr.what(image_filename)
            if extension is None:
                self.logger.debug('Image extension not detected, skipping')
                return ''
            content = open(image_filename, 'rb')
        except (IOError, AttributeError, TypeError):
            return ''
    except (HTTPError, URLError, TypeError):
        return ''
    txt = 'data:image/{};base64,\n{}'.format(
        extension, base64.b64encode(content.read()).decode('ascii'))
    content.close()
    return txt
def execute(self, obj):
    if not isinstance(obj, str):
        raise TypeError('\'{}\' is not of type str.'.format(obj))
    parsed = self._parse(obj)
    parsed = self._process(**parsed)
    return rfc3987.parse(rfc3987.compose(**parsed))
def validate_url(url):
    try:
        p = parse(url, rule='URI_reference')
        r = all((p['scheme'], p['authority'], p['path']))
    except Exception as e:
        print(e)
        r = False
    return r
def uri_validator(value, **kwargs):
    try:
        parts = rfc3987.parse(value, rule="URI")
    except ValueError:
        raise ValidationError(MESSAGES["format"]["invalid_uri"].format(value))
    if not parts["scheme"] or not parts["authority"]:
        raise ValidationError(MESSAGES["format"]["invalid_uri"].format(value))
def test_urn_link(urn, result):
    if isinstance(result, Exception):
        with pytest.raises(type(result)) as e:
            URNLink().execute(urn)
        assert e.value.args == result.args
    else:
        assert rfc3987.parse(result)  # Extra URL validation
        assert URNLink().execute(urn)['IRI'] == result
def test_ark_link(ark_id, result):
    if isinstance(result, Exception):
        with pytest.raises(type(result)) as e:
            ARKLink().execute(ark_id)
        assert e.value.args == result.args
    else:
        assert rfc3987.parse(result)  # Extra URL validation
        assert ARKLink().execute(ark_id)['IRI'] == result
def test_doi_link(doi, result):
    if isinstance(result, Exception):
        with pytest.raises(type(result)) as e:
            DOILink().execute(doi)
        assert e.value.args == result.args
    else:
        assert rfc3987.parse(result)  # Extra URL validation
        assert DOILink().execute(doi)['IRI'] == result
def uri_validator(value, **kwargs):
    try:
        parts = rfc3987.parse(value, rule='URI')
    except ValueError:
        raise ValidationError(MESSAGES['format']['invalid_uri'].format(value))
    if not parts['scheme'] or not parts['authority']:
        raise ValidationError(MESSAGES['format']['invalid_uri'].format(value))
def __init__(self, parent, format):
    self.format = format
    self.uri = parent.uri + FORMATS[format]
    # Use match(), which returns None on failure; parse() would raise
    # ValueError instead of ever yielding a falsy value.
    self.valid = rfc3987.match(self.uri, rule="absolute_URI")
    if self.valid:
        l.debug("Initialized serialization (%s: %s)." % (self.format, self.uri))
    else:
        l.warning("%s is not a valid absolute URI, so this serialization "
                  "will not be retrieved." % self.uri)
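The parse()/match() distinction the fix above relies on, sketched briefly: match() returns None for invalid input, where parse() raises ValueError.

import rfc3987

print(rfc3987.match('not a uri', rule='absolute_URI'))  # None
print(bool(rfc3987.match('http://example.com/x', rule='absolute_URI')))  # True
# rfc3987.parse('not a uri', rule='absolute_URI') raises ValueError instead.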
def getLayerData(self):
    # Is this a proper URL?
    try:
        rfc3987.parse(self.location, rule='IRI')
        isUrl = True
    except ValueError:
        isUrl = False

    # Download layer from URL.
    if isUrl:
        logging.info('Downloading {0} from {1}'.format(self.name, self.location))
        try:
            response = urllib2.urlopen(self.location)
            data = response.read()
        except Exception as e:
            logging.exception(e)
            return False
def check_api_url(self, url):
    try:
        # parse() raises ValueError on failure, so reaching the return
        # means the URL is a valid IRI.
        p = parse(url, rule="IRI")
        return p is not None
    except ValueError:
        return False
def try_host(self, hostname):
    try:
        result = parse(hostname, rule="URI")
        if result['authority'] != '':
            return True
    except ValueError as e:
        logging.error("address given does not match URI definition")
        logging.exception(e)
    return False
def expandURL(self, url_pattern, row, datatype=False):
    """Takes a Jinja or Python formatted string, applies it to the row
    values, and returns it as a URIRef"""
    url = self.render_pattern(unicode(url_pattern), row)

    # DEPRECATED
    # for ns, nsuri in namespaces.items():
    #     if url.startswith(ns):
    #         url = url.replace(ns + ':', nsuri)
    #         break

    try:
        iri = iribaker.to_iri(url)
        rfc3987.parse(iri, rule='IRI')
    except Exception:
        raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

    return URIRef(iri)
def parse_url(url):
    try:
        matches = rfc3987.parse(url, rule='URI')
    except ValueError:
        raise HTTPBadRequest(detail=Messages.invalid_uri)
    if matches['scheme'] not in ['http', 'https']:
        raise HTTPBadRequest(detail=Messages.invalid_uri)
    matches['path'] = matches['path'] or '/'
    matches['fragment'] = None
    return rfc3987.compose(**matches)
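A round-trip sketch of the normalization above: parse() splits the URI into parts, the path is defaulted and the fragment dropped, and compose() reassembles the result.

import rfc3987

parts = rfc3987.parse('https://example.com#frag', rule='URI')
parts['path'] = parts['path'] or '/'
parts['fragment'] = None
print(rfc3987.compose(**parts))  # https://example.com/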
def parse_streamer_url(url, nicknames):
    if match(url, 'absolute_URI'):
        # Extract the last non-empty path segment as the streamer name.
        rv1 = [x for x in parse(url)['path'].split('/') if x][-1]
        rv2 = url
        return rv1, rv2
    elif nicknames.find(url):
        return url, nicknames.get(url)
    else:
        print("Nickname \"{0}\" has not been defined yet".format(url))
        return None, None
def build_domain(self, url):
    """Load the fetcher module matching the second-level domain of `url`."""
    d = parse(url, rule='IRI')
    li = d['authority'].split('.')
    # Second-to-last label, e.g. 'example' in 'www.example.com'.
    domain = li[-2]
    self.is_valid(domain)
    module = self.my_import('domains.' + domain)
    self.fetcher = module.Fetcher(requester.Requester())