def extract(self, selector, host='', with_body_html=False):
    """Score every candidate node under <body> and rank them.

    Walks all descendants of the page's <body>, computes text-density
    statistics for each node, and returns the nodes sorted by their final
    score (best candidate for the article body first).

    :param selector: parsed document (lxml-style element with ``.xpath``)
    :param host: site origin used to absolutize relative <img> URLs;
        falls back to ``config['host']`` when empty
    :param with_body_html: when true (or ``config['with_body_html']``),
        include each node's raw HTML under the ``body_html`` key
    :return: list of ``(node_hash, node_info)`` tuples, sorted by
        ``node_info['score']`` descending
    """
    body = selector.xpath('//body')[0]
    # Hoist loop-invariant config lookups out of the per-node loop:
    # the original recomputed both on every iteration.
    host = host or config.get('host', '')
    need_body_html = with_body_html or config.get('with_body_html', False)
    for node in iter_node(body):
        node_hash = hash(node)
        density_info = self.calc_text_density(node)
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
        images_list = node.xpath('.//img/@src')
        if host:
            # Turn relative image URLs into absolute ones.
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {
            'ti': density_info['ti'],
            'lti': density_info['lti'],
            'tgi': density_info['tgi'],
            'ltgi': density_info['ltgi'],
            'node': node,
            'density': text_density,
            'text': ti_text,
            'images': images_list,
            'text_tag_count': text_tag_count,
            'sbdi': sbdi,
        }
        if need_body_html:
            # Serialize the node back to (unescaped) HTML for the caller.
            body_source_code = unescape(
                etree.tostring(node, encoding='utf-8').decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    std = self.calc_standard_deviation()
    self.calc_new_score(std)
    result = sorted(self.node_info.items(),
                    key=lambda x: x[1]['score'], reverse=True)
    return result
def extract(self, element: HtmlElement, keywords_xpath: str = '') -> str:
    """Extract the article keywords from *element*.

    Strategies are tried in priority order: a user-supplied XPath first,
    then the page's <meta> tags, and finally the body text as a last resort.
    """
    xpath = keywords_xpath or config.get('keywords', {}).get('xpath')
    # First priority: the user-specified XPath.
    keywords = self.extract_from_user_xpath(xpath, element)
    if not keywords:
        # Second priority: <meta> tags.
        keywords = self.extract_from_meta(element)
    if not keywords:
        # Worst case: pull keywords from the body text.
        keywords = self.extract_from_text(element)
    return keywords
def extract(self, element: HtmlElement, description_xpath: str = '') -> str:
    """Extract the article description from *element*.

    A user-supplied XPath takes priority; otherwise fall back to the
    page's <meta> tags.
    """
    xpath = description_xpath or config.get('description', {}).get('xpath')
    # First priority: the user-specified XPath.
    description = self.extract_from_user_xpath(xpath, element)
    if not description:
        # Fallback: <meta> tags.
        description = self.extract_from_meta(element)
    return description
def extract(self, url='', html=None, title_xpath='', author_xpath='',
            publish_time_xpath='', keywords_xpath='', description_xpath='',
            host='', noise_node_list=None, with_body_html=False):
    """Automatically extract structured data from a news-site page.

    :param url: URL of the news page; when non-empty it is fetched and
        takes precedence over *html*
    :param html: raw HTML source of the news page
    :param title_xpath: XPath for the article title
    :param author_xpath: XPath for the author
    :param publish_time_xpath: XPath for the publish time
    :param keywords_xpath: XPath for the article keywords
    :param description_xpath: XPath for the article description
    :param host: site origin, used to absolutize image URLs
    :param noise_node_list: XPaths of noise nodes to strip before
        content extraction
    :param with_body_html: include the winning node's raw HTML in the result
    :return: dict with title, author, publish_time, keywords, description,
        content, images (and body_html when requested)
    """
    # Pythonic truthiness: covers both None and '' (same behavior as the
    # original `url is not None and len(url) > 0` check).
    if url:
        html = self.getHtml(url=url)
    element = html2element(html)
    # Metadata fields are extracted BEFORE pre_parse/remove_noise_node,
    # since those mutate the tree for body extraction.
    title = TitleExtractor().extract(element, title_xpath=title_xpath)
    publish_time = TimeExtractor().extract(
        element, publish_time_xpath=publish_time_xpath)
    author = AuthorExtractor().extract(element, author_xpath=author_xpath)
    keywords = KeywordsExtractor().extract(element, keywords_xpath=keywords_xpath)
    description = DescriptionExtractor().extract(
        element, description_xpath=description_xpath)
    element = pre_parse(element)
    remove_noise_node(element, noise_node_list)
    content = ContentExtractor().extract(element, host, with_body_html)
    # content is sorted best-first; content[0][1] is the winner's node_info.
    result = {
        'title': title,
        'author': author,
        'publish_time': publish_time,
        'keywords': keywords,
        'description': description,
        'content': content[0][1]['text'],
        'images': content[0][1]['images'],
    }
    if with_body_html or config.get('with_body_html', False):
        result['body_html'] = content[0][1]['body_html']
    return result
def extract(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
    """Extract the publish time from *element*.

    Priority order: user-specified XPath, then <meta> tags, then the
    body text as a last resort.
    """
    xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
    return (
        self.extract_from_user_xpath(xpath, element)  # user XPath first
        or self.extract_from_meta(element)            # then <meta> tags
        or self.extract_from_text(element)            # body text last
    )
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
    """Extract the article title from *element*.

    Priority order: user-specified XPath, then the <title> tag, then
    <h*> tags.

    :return: the stripped title, or ``''`` when nothing was found
    """
    title_xpath = title_xpath or config.get('title', {}).get('xpath')
    title = (self.extract_from_xpath(element, title_xpath)
             or self.extract_from_title(element)
             or self.extract_from_htag(element))
    # Guard: if every strategy fails the chain may yield None, and the
    # original unconditional .strip() would raise AttributeError.
    return title.strip() if title else ''
def extract(self, element: HtmlElement, author_xpath: str = '') -> str:
    """Extract the author from *element*.

    Priority order: user-specified XPath, then <h*> tags.

    :return: the stripped author string, or ``''`` when nothing was found
    """
    author_xpath = author_xpath or config.get('author', {}).get('xpath')
    author = (self.extract_from_xpath(element, author_xpath)
              or self.extract_from_htag(element))
    # Guard: if both strategies fail the chain may yield None, and the
    # original unconditional .strip() would raise AttributeError.
    return author.strip() if author else ''