Beispiel #1
0
 def extract(self, selector, host='', body_xpath='', with_body_html=False):
     """Collect density statistics for every node under the body and rank them.

     Args:
         selector: Parsed document exposing ``xpath()`` (presumably an lxml
             element -- confirm against the caller).
         host: Host handed to ``pad_host_for_images`` for each image URL.
             Falls back to ``config['host']``; when both are empty the image
             URLs are left as returned by ``remove_img``.
         body_xpath: XPath locating the body element. Falls back to
             ``config['body']['xpath']`` and finally to ``'//body'``.
         with_body_html: When true (or when ``config['with_body_html']`` is
             true) the serialized markup of each node is stored under the
             ``'body_html'`` key of its info dict.

     Returns:
         A list of ``(node_hash, node_info)`` tuples taken from
         ``self.node_info``, sorted by ``node_info['score']`` in descending
         order. ``'score'`` is not written in this method; presumably
         ``calc_new_score`` adds it -- TODO confirm.

     Raises:
         IndexError: If the body XPath matches nothing (the ``[0]`` lookup),
             assuming ``xpath()`` returns a list.
     """
     body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
     body = selector.xpath(body_xpath or '//body')[0]
     body = self.remove_list_relevant(body)

     # Both values are identical for every node, so resolve them once instead
     # of on each loop iteration.
     host = host or config.get('host', '')
     include_html = with_body_html or config.get('with_body_html', False)

     for node in iter_node(body):
         density_info = self.calc_text_density(node)
         node_hash = hash(node)
         ti_text = density_info['ti_text']
         text_tag_count = self.count_text_tag(node, tag='p')
         sbdi = self.calc_sbdi(ti_text, density_info['ti'],
                               density_info['lti'])
         images_list = self.remove_img(node.xpath('.//img/@src'))
         if host:
             images_list = [
                 pad_host_for_images(host, url) for url in images_list
             ]
         node_info = {
             'ti': density_info['ti'],
             'lti': density_info['lti'],
             'tgi': density_info['tgi'],
             'ltgi': density_info['ltgi'],
             'node': node,
             'body': body,
             'density': density_info['density'],
             'text': ti_text,
             'images': images_list,
             'text_tag_count': text_tag_count,
             'sbdi': sbdi,
         }
         if include_html:
             # Serialize the node and turn HTML entities back into characters.
             node_info['body_html'] = unescape(
                 etree.tostring(node, encoding='utf-8').decode())
         self.node_info[node_hash] = node_info

     self.calc_new_score()
     return sorted(self.node_info.items(),
                   key=lambda item: item[1]['score'],
                   reverse=True)
Beispiel #2
0
 def extractor(self, content_element: HtmlElement, element: HtmlElement, publish_time_xpath: str = '') -> str:
     """Find the publish time of a page and pass it through ``resolve_time``.

     Args:
         content_element: Not used by this method; kept so the signature
             stays compatible with existing callers.
         element: Element handed to each of the extraction helpers.
         publish_time_xpath: XPath for the publish time. When empty, the
             value of ``config['publish_time']['xpath']`` is used instead.

     Returns:
         Whatever ``self.resolve_time`` returns for the first truthy
         candidate, or ``''`` when ``resolve_time`` raises an ``Exception``.
     """
     publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
     publish_time = (self.extract_from_user_xpath(publish_time_xpath, element)  # a user-specified XPath has top priority
                     or self.extract_from_meta(element)  # second priority: extract from Meta
                     or self.extract_from_text(element))  # worst case: extract from the body text
     try:
         return self.resolve_time(publish_time)
     except Exception:
         # Best effort: an unparsable time yields an empty result. Only
         # ``Exception`` is caught so that KeyboardInterrupt and SystemExit
         # still propagate instead of being silently swallowed.
         return ''
 def extractor(self, element: HtmlElement, author_xpath=''):
     """Return the author found in ``element``.

     An explicit XPath (the ``author_xpath`` argument, else
     ``config['author']['xpath']``) wins: its results are concatenated and
     returned as-is, even when that yields an empty string. Without an XPath
     the text of ``element`` is searched with each regex in
     ``self.author_pattern`` in order, and group 1 of the first match is
     returned. ``''`` is returned when no pattern matches.
     """
     xpath_expr = author_xpath or config.get('author', {}).get('xpath')
     if not xpath_expr:
         full_text = ''.join(element.xpath('.//text()'))
         # Lazy generator: patterns after the first hit are never evaluated.
         searches = (re.search(regex, full_text) for regex in self.author_pattern)
         first_hit = next((found for found in searches if found), None)
         return first_hit.group(1) if first_hit else ''
     return ''.join(element.xpath(xpath_expr))
 def extract(self,
             content,
             content_element: HtmlElement,
             element: HtmlElement,
             title_xpath: str = '') -> str:
     """Return the page title with surrounding whitespace stripped.

     The XPath used is ``title_xpath`` or, when that is empty,
     ``config['title']['xpath']``. Helpers are tried in a fixed order and
     the first truthy result wins: ``extract_by_xpath``,
     ``extract_by_htag_and_title``, ``extract_by_htag``, and finally
     ``extract_by_title``, whose result is used even when it is falsy.
     """
     xpath_expr = title_xpath or config.get('title', {}).get('xpath')

     candidate = self.extract_by_xpath(element, xpath_expr)
     if not candidate:
         candidate = self.extract_by_htag_and_title(element)
     if not candidate:
         candidate = self.extract_by_htag(element, content_element, content)
     if not candidate:
         candidate = self.extract_by_title(element)
     return candidate.strip()