Python _unquote_match Exemples

Langage de programmation: Python

Espace de noms/Paquet : lxml.html

Méthode/Fonction: _unquote_match

Exemples sur hotexamples.com : 2

Python _unquote_match - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de lxml.html._unquote_match extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Exemple #1

0

Afficher le fichier

Fichier : parsers.py Projet : shishaktkumarCLS/pywebcopy

def _handle_lxml_elem(el):
    """Yield every link carried by a single lxml element.

    Adapted from the source of ``lxml.html.iter_links``, extended to cope
    with attributes that hold several URLs at once (e.g. ``srcset``).

    Each yielded item is an ``(element, attribute, link, pos)`` tuple.
    ``attribute`` is ``None`` when the link sits in the element text rather
    than in an attribute. ``pos`` is the offset at which the link occurs;
    it is often 0 but can differ for links inside stylesheets or ``style``
    attributes.

    Note: when one text string or attribute value contains several links,
    they are yielded in reversed order. A consumer can then replace or
    delete them by position without shifting the positions of the links
    reported afterwards.
    """
    attributes = el.attrib
    tag_name = _nons(el.tag)

    # Attribute lookups below are deliberately performed lazily, right
    # before each yield, exactly as the generator is advanced.
    if tag_name == 'object':  # pragma: no cover
        base = None
        if 'codebase' in attributes:
            base = el.get('codebase')
            yield (el, 'codebase', base, 0)

        # ``classid`` / ``data`` are resolved against the codebase, if any.
        for name in ('classid', 'data'):
            if name not in attributes:
                continue
            target = el.get(name)
            if base is not None:
                target = urljoin(base, target)
            yield (el, name, target, 0)

        if 'archive' in attributes:
            for found in _archive_re.finditer(el.get('archive')):
                target = found.group(0)
                if base is not None:
                    target = urljoin(base, target)
                yield (el, 'archive', target, found.start())
    else:
        for name in SINGLE_LINK_ATTRIBS:
            if name in attributes:
                yield (el, name, attributes[name], 0)

        # Multi-URL attributes: walk the matches back to front.
        # NOTE(review): nesting under ``else`` was reconstructed from a
        # flattened listing -- confirm against the upstream file.
        for name in LIST_LINK_ATTRIBS:
            if name not in attributes:
                continue
            candidates = list(_iter_srcset_urls(attributes[name]))
            for found in reversed(candidates):
                link, offset = _unquote_match(
                    found.group(1).strip(), found.start(1))
                yield (el, name, link, offset)

    if tag_name == 'meta':
        if attributes.get('http-equiv', '').lower() == 'refresh':
            content = attributes.get('content', '')
            parsed = _parse_meta_refresh_url(content)
            if parsed:
                raw_url = parsed.group('url').strip()
            else:
                # Unexpected content means the redirect won't work, but be
                # permissive and report the entire string anyway.
                raw_url = content.strip()
            if raw_url:
                if parsed:
                    raw_pos = parsed.start('url')
                else:
                    raw_pos = content.find(raw_url)
                link, offset = _unquote_match(raw_url, raw_pos)
                yield (el, 'content', link, offset)
    elif tag_name == 'param':
        if (el.get('valuetype') or '').lower() == 'ref':
            yield (el, 'value', el.get('value'), 0)
    elif tag_name == 'style' and el.text:
        css_text = el.text
        # Entries are (start_pos, url) pairs.
        located = []
        for found in _iter_css_urls(css_text):
            located.append(
                _unquote_match(found.group(1), found.start(1))[::-1])
        for found in _iter_css_imports(css_text):
            located.append((found.start(1), found.group(1)))
        # Sorting by start position merges both match sets; descending
        # order keeps reported positions valid despite modifications.
        for offset, link in sorted(located, reverse=True):
            yield (el, None, link, offset)

    if 'style' in attributes:
        inline_matches = list(_iter_css_urls(attributes['style']))
        # Back to front, to simplify in-place modifications.
        for found in reversed(inline_matches):
            link, offset = _unquote_match(found.group(1), found.start(1))
            yield (el, 'style', link, offset)

Exemple #2

0

Afficher le fichier

def __handle(self, el):
    """Dispatch every link found on a freshly parsed lxml element.

    Each discovered link is passed to ``self.handle`` as
    ``(element, attribute, link, pos)``. Per the original description, this
    is where file objects get built and downloads get started.
    ``attribute`` is ``None`` for links found in the text of a ``<style>``
    element. Several links inside one value are dispatched back to front.
    """
    attributes = el.attrib
    tag_name = _nons(el.tag)
    dispatch = self.handle

    if tag_name == 'object':
        base = None
        if 'codebase' in attributes:
            base = el.get('codebase')
            dispatch(el, 'codebase', base, 0)

        # ``classid`` / ``data`` are resolved against the codebase, if any.
        for name in ('classid', 'data'):
            if name not in attributes:
                continue
            target = el.get(name)
            if base is not None:
                target = urljoin(base, target)
            dispatch(el, name, target, 0)

        if 'archive' in attributes:
            for found in _archive_re.finditer(el.get('archive')):
                target = found.group(0)
                if base is not None:
                    target = urljoin(base, target)
                dispatch(el, 'archive', target, found.start())
    else:
        for name in link_attrs:
            if name in attributes:
                dispatch(el, name, attributes[name], 0)

        # Multi-URL attributes: walk the matches back to front.
        # NOTE(review): nesting under ``else`` was reconstructed from a
        # flattened listing -- confirm against the upstream file.
        for name in list_link_attrs:
            if name not in attributes:
                continue
            candidates = list(_iter_srcset_urls(attributes[name]))
            for found in reversed(candidates):
                link, offset = _unquote_match(
                    found.group(1).strip(), found.start(1))
                dispatch(el, name, link, offset)

    if tag_name == 'meta':
        if attributes.get('http-equiv', '').lower() == 'refresh':
            content = attributes.get('content', '')
            parsed = _parse_meta_refresh_url(content)
            if parsed:
                raw_url = parsed.group('url').strip()
            else:
                # Unexpected content means the redirect won't work, but be
                # permissive and pass on the entire string anyway.
                raw_url = content.strip()
            if raw_url:
                if parsed:
                    raw_pos = parsed.start('url')
                else:
                    raw_pos = content.find(raw_url)
                link, offset = _unquote_match(raw_url, raw_pos)
                dispatch(el, 'content', link, offset)
    elif tag_name == 'param':
        if (el.get('valuetype') or '').lower() == 'ref':
            dispatch(el, 'value', el.get('value'), 0)
    elif tag_name == 'style' and el.text:
        css_text = el.text
        # Entries are (start_pos, url) pairs.
        located = []
        for found in _iter_css_urls(css_text):
            located.append(
                _unquote_match(found.group(1), found.start(1))[::-1])
        for found in _iter_css_imports(css_text):
            located.append((found.start(1), found.group(1)))
        # Sorting by start position merges both match sets; descending
        # order keeps reported positions valid despite modifications.
        for offset, link in sorted(located, reverse=True):
            dispatch(el, None, link, offset)

    if 'style' in attributes:
        inline_matches = list(_iter_css_urls(attributes['style']))
        # Back to front, to simplify in-place modifications.
        for found in reversed(inline_matches):
            link, offset = _unquote_match(found.group(1), found.start(1))
            dispatch(el, 'style', link, offset)