def extract(self, selector, host='', body_xpath='', with_body_html=False):
    """Score every candidate node under the page body and rank them.

    :param selector: lxml selector/element of the parsed page
    :param host: site host used to absolutize relative image URLs;
        falls back to ``config['host']`` when empty
    :param body_xpath: optional XPath overriding the default ``//body`` root
    :param with_body_html: also store each node's HTML source under
        ``'body_html'`` (can likewise be switched on via config)
    :return: list of ``(node_hash, node_info)`` tuples sorted by
        ``node_info['score']``, best candidate first
    """
    body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
    if body_xpath:
        body = selector.xpath(body_xpath)[0]
    else:
        body = selector.xpath('//body')[0]
    # These lookups are loop-invariant — resolve them once, not per node.
    host = host or config.get('host', '')
    need_body_html = with_body_html or config.get('with_body_html', False)
    for node in iter_node(body):
        node_hash = hash(node)
        density_info = self.calc_text_density(node)
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
        images_list = node.xpath('.//img/@src')
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {'ti': density_info['ti'],
                     'lti': density_info['lti'],
                     'tgi': density_info['tgi'],
                     'ltgi': density_info['ltgi'],
                     'node': node,
                     'density': text_density,
                     'text': ti_text,
                     'images': images_list,
                     'text_tag_count': text_tag_count,
                     'sbdi': sbdi}
        if need_body_html:
            body_source_code = unescape(etree.tostring(node, encoding='utf-8').decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    std = self.calc_standard_deviation()
    self.calc_new_score(std)
    result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
    return result
 def extract(self,
             selector,
             host='',
             body_xpath='',
             with_body_html=False,
             use_visiable_info=False):
     body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
     use_visiable_info = use_visiable_info or config.get(
         'use_visiable_info', False)
     if body_xpath:
         body = selector.xpath(body_xpath)[0]
     else:
         body = selector.xpath('//body')[0]
     for node in iter_node(body):
         if use_visiable_info:
             if not node.attrib.get('is_visiable', True):
                 continue
             coordinate_json = node.attrib.get('coordinate', '{}')
             coordinate = json.loads(coordinate_json)
             if coordinate.get('height', 0) < 150:  # 正文块的高度应该要大于150px
                 continue
         node_hash = hash(node)
         density_info = self.calc_text_density(node)
         text_density = density_info['density']
         ti_text = density_info['ti_text']
         text_tag_count = self.count_text_tag(node, tag='p')
         sbdi = self.calc_sbdi(ti_text, density_info['ti'],
                               density_info['lti'])
         images_list = node.xpath('.//img/@src')
         host = host or config.get('host', '')
         if host:
             images_list = [
                 pad_host_for_images(host, url) for url in images_list
             ]
         node_info = {
             'ti': density_info['ti'],
             'lti': density_info['lti'],
             'tgi': density_info['tgi'],
             'ltgi': density_info['ltgi'],
             'node': node,
             'density': text_density,
             'text': ti_text,
             'images': images_list,
             'text_tag_count': text_tag_count,
             'sbdi': sbdi
         }
         if use_visiable_info:
             node_info['is_visiable'] = node.attrib['is_visiable']
             node_info['coordinate'] = node.attrib.get('coordinate', '')
         if with_body_html or config.get('with_body_html', False):
             body_source_code = unescape(
                 etree.tostring(node, encoding='utf-8').decode())
             node_info['body_html'] = body_source_code
         self.node_info[node_hash] = node_info
     self.calc_new_score()
     result = sorted(self.node_info.items(),
                     key=lambda x: x[1]['score'],
                     reverse=True)
     return result
 def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
     title_xpath = title_xpath or config.get('title', {}).get('xpath')
     title = (self.extract_by_xpath(element, title_xpath)
              or self.extract_by_htag_and_title(element)
              or self.extract_by_title(element)
              or self.extract_by_htag(element))
     return title.strip()
 def extractor(self, element: HtmlElement, author_xpath=''):
     # 【GET】连续按键索引取值时,可设置值不存在时的默认值为{},这样后面的索引就不会报错了。
     author_xpath = author_xpath or config.get('author', {}).get(
         'xpath')  # 获取自定义的规则
     # 首先,用自定义的规则匹配作者
     if author_xpath:
         author = ''.join(element.xpath(author_xpath))
         return author
     # 其次,用写好的规则匹配作者
     text = ''.join(element.xpath('.//text()'))  # 获取全部文本
     for pattern in self.author_pattern:  # 遍历模式逐个匹配
         author_obj = re.search(pattern, text)
         if author_obj:
             return author_obj.group(1)
     # 最后,都没匹配到返回空字符串
     return ''
# Example #5
# 0
 def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
     publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
     publish_time = (self.extract_from_user_xpath(publish_time_xpath, element)  # 用户指定的 Xpath 是第一优先级
                     or self.extract_from_meta(element)   # 第二优先级从 Meta 中提取
                     or self.extract_from_text(element))  # 最坏的情况从正文中提取
     return publish_time
    def extract(self, selector, host='', with_body_html=False):
        """Score every node under <body> and return them ranked by score.

        Standard W3C HTML structure for reference:

        <!DOCTYPE html>
        <html>
          <head>
            <meta charset="UTF-8">
            <title>page title</title>
          </head>
          <body>
            <h1>page body</h1>
          </body>
        </html>

        :param selector: lxml selector/element of the parsed page
        :param host: site host used to absolutize relative image URLs
        :param with_body_html: also store each node's HTML source
        :return: list of (node_hash, node_info) tuples, best score first
        """
        body = selector.xpath('//body')[0]  # select the <body> tag
        # Loop-invariant lookups hoisted out of the per-node loop.
        host = host or config.get('host', '')
        need_body_html = with_body_html or config.get('with_body_html', False)
        for node in iter_node(body):
            node_hash = hash(node)
            # Text-density metrics; returns
            # {'density', 'ti_text', 'ti', 'lti', 'tgi', 'ltgi'}
            density_info = self.calc_text_density(node)

            # Symbol density of the text.
            text_density = density_info['density']
            ti_text = density_info['ti_text']
            text_tag_count = self.count_text_tag(node, tag='p')  # number of <p> tags
            sbdi = self.calc_sbdi(ti_text, density_info['ti'],
                                  density_info['lti'])  # returns sbdi or 1

            # Collect image URLs (all img/@src; prefix with host if configured).
            images_list = node.xpath('.//img/@src')
            if host:
                images_list = [
                    pad_host_for_images(host, url) for url in images_list
                ]

            node_info = {
                'ti': density_info['ti'],
                'lti': density_info['lti'],
                'tgi': density_info['tgi'],
                'ltgi': density_info['ltgi'],
                'node': node,
                'density': text_density,
                'text': ti_text,
                'images': images_list,
                'text_tag_count': text_tag_count,
                'sbdi': sbdi
            }
            # Keep the HTML source of the candidate content node.
            if need_body_html:
                body_source_code = unescape(
                    etree.tostring(node, encoding='utf-8').decode())
                node_info['body_html'] = body_source_code
            self.node_info[node_hash] = node_info
        std = self.calc_standard_deviation()  # standard deviation of densities
        self.calc_new_score(std)  # core scoring function
        # Sort by the 'score' value inside each node_info dict.
        result = sorted(self.node_info.items(),
                        key=lambda x: x[1]['score'],
                        reverse=True)
        return result