def lxml(self) -> HtmlElement:
        """Return the `lxml <http://lxml.de>`_ tree for this
        :class:`Element <Element>` or :class:`HTML <HTML>`.

        The tree is built on first access and stored on ``self._lxml``;
        later calls return the stored object.  ``self.html`` is parsed with
        ``soup_parse`` first, and with ``lxml.html.fromstring`` when that
        raises :class:`ValueError`.
        """
        # Fast path: a tree was already built by an earlier call.
        if self._lxml is not None:
            return self._lxml

        try:
            tree = soup_parse(self.html, features='html.parser')
        except ValueError:
            # soup_parse rejected the markup; let lxml parse it directly.
            tree = lxml.html.fromstring(self.html)

        self._lxml = tree
        return self._lxml
コード例 #2
0
ファイル: pattern.py プロジェクト: Shicheng-Cheng/pyReptile
 def xpath(self, response, selector, **kwargs):
     """Evaluate an XPath ``selector`` on ``response`` and collect node text.

     Args:
         response: Markup to parse.  It is handed to ``soup_parse`` and, if
             that fails, to ``lxml.html.fromstring``.
         selector: XPath expression evaluated on the parsed tree.
         **kwargs: ``parser`` sets the ``features`` value passed to
             ``soup_parse`` (default ``'html.parser'``).

     Returns:
         A list with the ``.text`` attribute of every XPath result, in
         result order.
     """
     parser = kwargs.get('parser', 'html.parser')
     try:
         soup = soup_parse(response, features=parser)
     except Exception:
         # Best-effort fallback to lxml's own parser.  ``Exception`` rather
         # than a bare ``except`` so KeyboardInterrupt / SystemExit are not
         # swallowed.
         soup = lxml.html.fromstring(response)
     # NOTE(review): assumes every result is an element exposing ``.text``;
     # selectors that yield strings (e.g. ``@href``) would presumably raise
     # AttributeError here -- confirm against callers.
     return [node.text for node in soup.xpath(selector)]
コード例 #3
0
    def lxml(self) -> HtmlElement:
        """Parsed `lxml <http://lxml.de>`_ form of this
        :class:`Element <Element>` or :class:`HTML <HTML>`.

        Built once and kept on ``self._lxml``.  ``soup_parse`` is tried on
        ``self.html`` first; a :class:`ValueError` from it switches to
        ``lxml.html.fromstring``.
        """
        if self._lxml is not None:
            # Already parsed on a previous call.
            return self._lxml

        try:
            parsed = soup_parse(self.html, features='html.parser')
        except ValueError:
            parsed = lxml.html.fromstring(self.html)

        self._lxml = parsed
        return self._lxml
コード例 #4
0
    def lxml(self) -> HtmlElement:
        """Parsed `lxml <http://lxml.de>`_ form of this
        :class:`Element <Element>` or :class:`HTML <HTML>`.

        The tree is built on first access and kept on ``self._lxml``.
        ``soup_parse`` is tried on ``self.html`` first.  On
        :class:`ValueError` the raw bytes are decoded as UTF-8, every
        ``'gb2312'`` substring is replaced with ``self.default_encoding``,
        and the re-encoded result is given to ``lxml.html.fromstring``.
        """
        if self._lxml is not None:
            return self._lxml

        try:
            self._lxml = soup_parse(self.html, features='html.parser')
        except ValueError:
            # Original author's note: lxml.html.fromstring decodes according
            # to the encoding declared inside the HTML.  When the page
            # declares gbk, GB2312 or similar, then after render() (i.e.
            # after JS has been rendered) self.raw_html is UTF-8 while the
            # charset value in the HTML still says gbk/gb2312, and that
            # encode/decode mismatch produces garbled text.  Hence the
            # rewrite below.  'gb2312' is hard-coded only for illustration;
            # it can be adapted to the page's actual encoding.
            # Previous implementation:
            # self._lxml = lxml.html.fromstring(self.raw_html)
            decoded = str(self.raw_html, encoding='utf-8')
            patched = decoded.replace('gb2312', self.default_encoding)
            self._lxml = lxml.html.fromstring(patched.encode('utf-8'))

        return self._lxml
コード例 #5
0
 def lxml(self) -> HtmlElement:
     """Parse ``self.html`` into its `lxml <http://lxml.de>`_ form for this
     :class:`Element <Element>` or :class:`HTML <HTML>`.

     Every call re-parses the markup with ``soup_parse`` using the
     ``'html.parser'`` feature set; nothing is cached here.
     """
     parsed = soup_parse(self.html, features='html.parser')
     return parsed