def extract(self, selector, host='', body_xpath='', with_body_html=False):
    """Collect per-node statistics under the page body and rank the nodes.

    Args:
        selector: Parsed document exposing ``.xpath()``; the body node is
            looked up on it.
        host: Prefix passed to ``pad_host_for_images`` for every image URL.
            Falls back to ``config['host']`` when empty; if both are empty
            the image URLs are kept as extracted.
        body_xpath: XPath selecting the body node. Falls back to
            ``config['body']['xpath']`` and finally to ``//body``.
        with_body_html: When truthy (or when ``config['with_body_html']``
            is truthy) the serialized markup of each node is stored under
            the ``'body_html'`` key.

    Returns:
        The ``(node_hash, node_info)`` pairs of ``self.node_info`` sorted
        by ``node_info['score']`` in descending order.

    Raises:
        IndexError: Propagated when the body XPath result is empty, since
            the first match is taken unconditionally.

    Note:
        This method only adds entries to ``self.node_info``; it does not
        clear it first. The ``'score'`` key used for sorting is not set
        here -- presumably ``self.calc_new_score()`` fills it in.
    """
    body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
    body = selector.xpath(body_xpath or '//body')[0]
    body = self.remove_list_relevant(body)

    # These settings do not depend on the node, so resolve them once
    # instead of on every loop iteration.
    host = host or config.get('host', '')
    include_body_html = with_body_html or config.get('with_body_html', False)

    for node in iter_node(body):
        density_info = self.calc_text_density(node)
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])

        images_list = self.remove_img(node.xpath('.//img/@src'))
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]

        node_info = {
            'ti': density_info['ti'],
            'lti': density_info['lti'],
            'tgi': density_info['tgi'],
            'ltgi': density_info['ltgi'],
            'node': node,
            'body': body,
            'density': density_info['density'],
            'text': ti_text,
            'images': images_list,
            'text_tag_count': text_tag_count,
            'sbdi': sbdi,
        }
        if include_body_html:
            # Serialize the node and turn HTML entities back into characters.
            node_info['body_html'] = unescape(
                etree.tostring(node, encoding='utf-8').decode())

        self.node_info[hash(node)] = node_info

    self.calc_new_score()
    return sorted(self.node_info.items(),
                  key=lambda item: item[1]['score'],
                  reverse=True)
def extractor(self, content_element: HtmlElement, element: HtmlElement, publish_time_xpath: str = '') -> str:
    """Find the publish time of a page and pass it through ``resolve_time``.

    Candidate sources are tried in priority order and the first truthy
    value wins: the user-supplied XPath, then the page meta information,
    then the page text.

    Args:
        content_element: Accepted for interface compatibility; not used by
            this method.
        element: Element that every extraction strategy is run against.
        publish_time_xpath: XPath supplied by the caller. Falls back to
            ``config['publish_time']['xpath']`` when empty.

    Returns:
        Whatever ``self.resolve_time`` returns for the extracted value, or
        ``''`` if ``resolve_time`` raises.
    """
    publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
    publish_time = (
        self.extract_from_user_xpath(publish_time_xpath, element)  # first priority: user-specified XPath
        or self.extract_from_meta(element)  # second priority: extract from the meta information
        or self.extract_from_text(element)  # worst case: extract from the text
    )
    try:
        return self.resolve_time(publish_time)
    except Exception:
        # Best effort: an unparsable value yields an empty result. A bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit, so
        # only ordinary exceptions are caught here.
        return ''
def extractor(self, element: HtmlElement, author_xpath=''):
    """Return the author found in ``element``.

    Args:
        element: Element to search.
        author_xpath: XPath supplied by the caller. Falls back to
            ``config['author']['xpath']`` when empty.

    Returns:
        When an XPath is available, the concatenation of everything it
        selects. Otherwise group 1 of the first pattern in
        ``self.author_pattern`` that matches the element's joined text,
        or ``''`` when no pattern matches.
    """
    xpath_expr = author_xpath or config.get('author', {}).get('xpath')
    if not xpath_expr:
        full_text = ''.join(element.xpath('.//text()'))
        # Lazily try each pattern in order and stop at the first match.
        attempts = (re.search(regex, full_text) for regex in self.author_pattern)
        first_hit = next((found for found in attempts if found), None)
        if first_hit is None:
            return ''
        return first_hit.group(1)

    # An explicit XPath short-circuits the pattern search entirely.
    return ''.join(element.xpath(xpath_expr))
def extract(self, content, content_element: HtmlElement, element: HtmlElement, title_xpath: str = '') -> str:
    """Return the page title with surrounding whitespace stripped.

    Strategies are tried in order and the first truthy result is used:
    the XPath (caller-supplied, else ``config['title']['xpath']``), then
    ``extract_by_htag_and_title``, then ``extract_by_htag``, and finally
    ``extract_by_title``, whose result is used even when it is falsy.

    Args:
        content: Forwarded to ``extract_by_htag``.
        content_element: Forwarded to ``extract_by_htag``.
        element: Element that every strategy is run against.
        title_xpath: XPath supplied by the caller.

    Returns:
        The selected title after ``.strip()``.
    """
    xpath_expr = title_xpath or config.get('title', {}).get('xpath')

    found = self.extract_by_xpath(element, xpath_expr)
    if not found:
        found = self.extract_by_htag_and_title(element)
    if not found:
        found = self.extract_by_htag(element, content_element, content)
    if not found:
        # Last resort; its result is kept as-is, truthy or not.
        found = self.extract_by_title(element)
    return found.strip()