コード例 #1
0
 def extract(self, selector, host='', with_body_html=False):
     """Gather per-node statistics under ``<body>`` and return them ranked.

     For every node yielded by ``iter_node(body)`` a record is built from
     ``self.calc_text_density``, ``self.count_text_tag(node, tag='p')`` and
     ``self.calc_sbdi`` and stored in ``self.node_info`` under
     ``hash(node)``.

     :param selector: object whose ``xpath('//body')`` result is indexed
         with ``[0]``; an empty result makes that lookup fail.
     :param host: prefix handed to ``pad_host_for_images`` for each image
         URL; when empty, ``config.get('host', '')`` is used instead.
     :param with_body_html: when truthy (or when the ``with_body_html``
         config entry is truthy) each record also gets a ``body_html``
         entry holding the unescaped serialization of the node.
     :return: ``self.node_info.items()`` as a list sorted by each record's
         ``'score'`` value, highest first.
     """
     body = selector.xpath('//body')[0]

     # Neither setting depends on the node, so resolve both once up front
     # rather than on every loop iteration.
     host = host or config.get('host', '')
     with_body_html = with_body_html or config.get('with_body_html', False)

     for node in iter_node(body):
         node_hash = hash(node)
         density_info = self.calc_text_density(node)
         ti_text = density_info['ti_text']
         text_tag_count = self.count_text_tag(node, tag='p')
         sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
         images_list = node.xpath('.//img/@src')
         if host:
             images_list = [pad_host_for_images(host, url) for url in images_list]
         node_info = {'ti': density_info['ti'],
                      'lti': density_info['lti'],
                      'tgi': density_info['tgi'],
                      'ltgi': density_info['ltgi'],
                      'node': node,
                      'density': density_info['density'],
                      'text': ti_text,
                      'images': images_list,
                      'text_tag_count': text_tag_count,
                      'sbdi': sbdi}
         if with_body_html:
             node_info['body_html'] = unescape(
                 etree.tostring(node, encoding='utf-8').decode())
         self.node_info[node_hash] = node_info

     std = self.calc_standard_deviation()
     # NOTE(review): the 'score' key used for sorting is not set in this
     # method; presumably calc_new_score adds it to each record - confirm.
     self.calc_new_score(std)
     return sorted(self.node_info.items(),
                   key=lambda item: item[1]['score'],
                   reverse=True)
コード例 #2
0
 def extract(self, element: HtmlElement, keywords_xpath: str = '') -> str:
     """Return keywords for ``element``, trying three sources in order.

     :param element: parsed document handed to each extraction strategy.
     :param keywords_xpath: caller-supplied XPath; when empty, the
         ``xpath`` entry of the ``keywords`` config section is used.
     :return: the first truthy result of, in order, the user XPath, the
         meta lookup and the text lookup; if none is truthy, whatever the
         text lookup returned.
     """
     if not keywords_xpath:
         keywords_xpath = config.get('keywords', {}).get('xpath')

     # Highest priority: the XPath supplied by the user (or config).
     found = self.extract_from_user_xpath(keywords_xpath, element)
     if found:
         return found

     # Second priority: extract from the meta information.
     found = self.extract_from_meta(element)
     if found:
         return found

     # Last resort: extract from the body text.
     return self.extract_from_text(element)
コード例 #3
0
 def extract(self,
             element: HtmlElement,
             description_xpath: str = '') -> str:
     """Return the description for ``element``.

     :param element: parsed document handed to each extraction strategy.
     :param description_xpath: caller-supplied XPath; when empty, the
         ``xpath`` entry of the ``description`` config section is used.
     :return: the user-XPath result when truthy, otherwise whatever the
         meta lookup returned.
     """
     if not description_xpath:
         description_xpath = config.get('description', {}).get('xpath')

     # Highest priority: the XPath supplied by the user (or config).
     found = self.extract_from_user_xpath(description_xpath, element)
     if not found:
         found = self.extract_from_meta(element)
     return found
コード例 #4
0
ファイル: __init__.py プロジェクト: justinzm/Web-Extractor
    def extract(self,
                url='',
                html=None,
                title_xpath='',
                author_xpath='',
                publish_time_xpath='',
                keywords_xpath='',
                description_xpath='',
                host='',
                noise_node_list=None,
                with_body_html=False):
        """
        Automated extraction for news-style sites.

        :param url:                 URL of the news page; when non-empty the
                                    page is fetched with ``self.getHtml`` and
                                    the result replaces ``html``
        :param html:                source code of the news page
        :param title_xpath:         XPath of the news title
        :param author_xpath:        XPath of the author
        :param publish_time_xpath:  XPath of the publish time
        :param keywords_xpath:      XPath of the news keywords
        :param description_xpath:   XPath of the news summary
        :param host:                site address, forwarded to the content
                                    extractor
        :param noise_node_list:     forwarded to ``remove_noise_node`` to
                                    strip unwanted content
        :param with_body_html:      when truthy (or when the
                                    ``with_body_html`` config entry is
                                    truthy) the result also carries
                                    ``body_html``
        :return:    dict with ``title``, ``author``, ``publish_time``,
                    ``keywords``, ``description``, ``content`` and
                    ``images`` (plus ``body_html`` when requested)
        """
        url_given = url is not None and len(url) > 0
        if url_given:
            html = self.getHtml(url=url)

        element = html2element(html)

        # Metadata is read from the document before pre_parse and noise
        # removal are applied to it.
        title = TitleExtractor().extract(element, title_xpath=title_xpath)
        publish_time = TimeExtractor().extract(
            element, publish_time_xpath=publish_time_xpath)
        author = AuthorExtractor().extract(element, author_xpath=author_xpath)
        keywords = KeywordsExtractor().extract(
            element, keywords_xpath=keywords_xpath)
        description = DescriptionExtractor().extract(
            element, description_xpath=description_xpath)

        element = pre_parse(element)
        remove_noise_node(element, noise_node_list)
        ranked = ContentExtractor().extract(element, host, with_body_html)

        # The first entry of the ranked list is used; its second item is
        # the info dict for that node.
        best = ranked[0][1]
        result = dict(
            title=title,
            author=author,
            publish_time=publish_time,
            keywords=keywords,
            description=description,
            content=best['text'],
            images=best['images'],
        )
        if with_body_html or config.get('with_body_html', False):
            result['body_html'] = best['body_html']
        return result
コード例 #5
0
 def extract(self,
             element: HtmlElement,
             publish_time_xpath: str = '') -> str:
     """Return the publish time for ``element``, trying three sources in order.

     :param element: parsed document handed to each extraction strategy.
     :param publish_time_xpath: caller-supplied XPath; when empty, the
         ``xpath`` entry of the ``publish_time`` config section is used.
     :return: the first truthy result of, in order, the user XPath, the
         meta lookup and the text lookup; if none is truthy, whatever the
         text lookup returned.
     """
     if not publish_time_xpath:
         publish_time_xpath = config.get('publish_time', {}).get('xpath')

     # Highest priority: the XPath supplied by the user (or config).
     found = self.extract_from_user_xpath(publish_time_xpath, element)
     if found:
         return found

     # Second priority: extract from the meta information.
     found = self.extract_from_meta(element)
     if found:
         return found

     # Last resort: extract from the body text.
     return self.extract_from_text(element)
コード例 #6
0
 def extract(self, element: HtmlElement, title_xpath: str = ''):
     """Return the stripped title for ``element``.

     :param element: parsed document handed to each extraction strategy.
     :param title_xpath: caller-supplied XPath; when empty, the ``xpath``
         entry of the ``title`` config section is used.
     :return: ``.strip()`` of the first truthy result of the XPath lookup,
         the title lookup and the h-tag lookup (or of the h-tag result if
         none is truthy).
     """
     if not title_xpath:
         title_xpath = config.get('title', {}).get('xpath')

     title = self.extract_from_xpath(element, title_xpath)
     if not title:
         title = self.extract_from_title(element)
     if not title:
         title = self.extract_from_htag(element)
     return title.strip()
コード例 #7
0
 def extract(self, element: HtmlElement, author_xpath=''):
     """Return the stripped author for ``element``.

     :param element: parsed document handed to each extraction strategy.
     :param author_xpath: caller-supplied XPath; when empty, the ``xpath``
         entry of the ``author`` config section is used.
     :return: ``.strip()`` of the XPath lookup result when truthy,
         otherwise of the h-tag lookup result.
     """
     if not author_xpath:
         author_xpath = config.get('author', {}).get('xpath')

     author = self.extract_from_xpath(element, author_xpath)
     if not author:
         author = self.extract_from_htag(element)
     return author.strip()