def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
    """Given an XPath selector, returns a list of
    :class:`Element <Element>` objects or a single one.

    :param selector: XPath Selector to use.
    :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
    :param first: Whether or not to return just the first result.
    :param _encoding: The encoding format. Falls back to ``self.encoding``
        when not given.

    If a sub-selector is specified (e.g. ``//a/@href``), a simple
    list of results is returned. Those plain-string results are never
    sanitized, even when ``clean`` is ``True``, because they are not
    :class:`Element <Element>` objects.

    See W3School's `XPath Examples
    <https://www.w3schools.com/xml/xpath_examples.asp>`_
    for more details.

    If ``first`` is ``True``, only returns the first
    :class:`Element <Element>` found.
    """
    selected = self.lxml.xpath(selector)

    # Text/attribute results come back as ``_ElementUnicodeResult``; those
    # are flattened to ``str``, everything else is wrapped in an Element.
    elements = [
        str(selection)
        if isinstance(selection, etree._ElementUnicodeResult)
        else Element(element=selection, url=self.url, default_encoding=_encoding or self.encoding)
        for selection in selected
    ]

    # Sanitize the found HTML.
    if clean:
        for element in elements:
            # String results have neither ``.lxml`` nor ``.raw_html``;
            # touching them here used to raise AttributeError.
            if isinstance(element, str):
                continue
            element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))

    return _get_first_or_list(elements, first)
def html_tostring(tree):
    """Serialize ``tree`` via ``lxml_html_tostring`` and strip the result.

    :param tree: The object to serialize; passed straight through to
        ``lxml_html_tostring``.
    :return: The pretty-printed, ``utf-8`` encoded output with leading and
        trailing whitespace removed.
    """
    # ``method`` is not passed, so the serializer's own default applies.
    options = {"pretty_print": True, "encoding": 'utf-8'}
    serialized = lxml_html_tostring(tree, **options)
    return serialized.strip()
def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find:
    """Given a CSS Selector, returns a list of
    :class:`Element <Element>` objects or a single one.

    :param selector: CSS Selector to use.
    :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
    :param containing: If specified, only return elements that contain the provided text.
        May be a single string or a list of strings; matching is
        case-insensitive and an element is kept if it contains any of them.
    :param first: Whether or not to return just the first result.
    :param _encoding: The encoding format. Falls back to ``self.encoding``
        when not given.

    Example CSS Selectors:

    - ``a``
    - ``a.someClass``
    - ``a#someID``
    - ``a[target=_blank]``

    See W3School's `CSS Selectors Reference
    <https://www.w3schools.com/cssref/css_selectors.asp>`_
    for more details.

    If ``first`` is ``True``, only returns the first
    :class:`Element <Element>` found.
    """
    # A lone string is treated as a one-item list of search terms.
    if isinstance(containing, str):
        containing = [containing]

    encoding = _encoding or self.encoding

    matches = [
        Element(element=node, url=self.url, default_encoding=encoding)
        for node in self.pq(selector)
    ]

    if containing:
        # Keep only the matches whose full text contains at least one term.
        matches = [
            match
            for match in matches
            if any([term.lower() in match.full_text.lower() for term in containing])
        ]
        # The filtered matches are handed back in reverse of selection order.
        matches.reverse()

    # Sanitize the found HTML.
    if clean:
        for match in matches:
            match.raw_html = lxml_html_tostring(cleaner.clean_html(match.lxml))

    return _get_first_or_list(matches, first)