def extract(self, selector, host='', body_xpath='', with_body_html=False):
    body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
    if body_xpath:
        body = selector.xpath(body_xpath)[0]
    else:
        body = selector.xpath('//body')[0]
    for node in iter_node(body):
        node_hash = hash(node)
        density_info = self.calc_text_density(node)
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
        images_list = node.xpath('.//img/@src')
        host = host or config.get('host', '')
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {
            'ti': density_info['ti'],
            'lti': density_info['lti'],
            'tgi': density_info['tgi'],
            'ltgi': density_info['ltgi'],
            'node': node,
            'density': text_density,
            'text': ti_text,
            'images': images_list,
            'text_tag_count': text_tag_count,
            'sbdi': sbdi
        }
        if with_body_html or config.get('with_body_html', False):
            body_source_code = unescape(etree.tostring(node, encoding='utf-8').decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    std = self.calc_standard_deviation()
    self.calc_new_score(std)
    result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
    return result
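# Usage sketch (illustrative, not from this file): rank the candidate nodes
# and take the top one as the main content. The class name `ContentExtractor`
# and its zero-argument constructor are assumptions here; in the package the
# class also wires up `config`, `iter_node`, and `pad_host_for_images`.
from lxml.html import fromstring

html = '<html><body><article><p>First paragraph...</p><p>Second paragraph...</p></article></body></html>'
selector = fromstring(html)
ranked = ContentExtractor().extract(selector, host='https://example.com')
node_hash, best = ranked[0]  # highest-scoring candidate comes first
print(best['text'], best['images'])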
def extract(self, selector, host='', body_xpath='', with_body_html=False, use_visiable_info=False):
    body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
    use_visiable_info = use_visiable_info or config.get('use_visiable_info', False)
    if body_xpath:
        body = selector.xpath(body_xpath)[0]
    else:
        body = selector.xpath('//body')[0]
    for node in iter_node(body):
        if use_visiable_info:
            if not node.attrib.get('is_visiable', True):
                continue
            coordinate_json = node.attrib.get('coordinate', '{}')
            coordinate = json.loads(coordinate_json)
            if coordinate.get('height', 0) < 150:  # the main-content block should be taller than 150px
                continue
        node_hash = hash(node)
        density_info = self.calc_text_density(node)
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
        images_list = node.xpath('.//img/@src')
        host = host or config.get('host', '')
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {
            'ti': density_info['ti'],
            'lti': density_info['lti'],
            'tgi': density_info['tgi'],
            'ltgi': density_info['ltgi'],
            'node': node,
            'density': text_density,
            'text': ti_text,
            'images': images_list,
            'text_tag_count': text_tag_count,
            'sbdi': sbdi
        }
        if use_visiable_info:
            node_info['is_visiable'] = node.attrib['is_visiable']
            node_info['coordinate'] = node.attrib.get('coordinate', '')
        if with_body_html or config.get('with_body_html', False):
            body_source_code = unescape(etree.tostring(node, encoding='utf-8').decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    self.calc_new_score()
    result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
    return result
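# The visibility branch above expects each element to already carry
# `is_visiable` and `coordinate` attributes, presumably stamped on by an
# earlier pass (e.g. a headless browser measuring each node's bounding box).
# A minimal sketch of such preprocessing, assuming lxml elements; only the
# attribute names are taken from the code above, the rest is hypothetical.
import json

def stamp_visibility(element, rect):
    # rect is a dict like {'x': 0, 'y': 120, 'width': 800, 'height': 600},
    # e.g. from JavaScript's getBoundingClientRect().
    visible = rect['width'] > 0 and rect['height'] > 0
    # lxml attribute values must be strings; an empty string is falsy, so
    # the `if not node.attrib.get('is_visiable', True)` check above will
    # `continue` past hidden nodes.
    element.attrib['is_visiable'] = 'true' if visible else ''
    element.attrib['coordinate'] = json.dumps(rect)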
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
    title_xpath = title_xpath or config.get('title', {}).get('xpath')
    title = (self.extract_by_xpath(element, title_xpath)
             or self.extract_by_htag_and_title(element)
             or self.extract_by_title(element)
             or self.extract_by_htag(element))
    return title.strip()
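# The title extraction relies on `or` short-circuiting: each strategy
# returns '' (falsy) when it fails, so control falls through to the next
# one, and only the final result gets stripped. A self-contained toy of the
# same pattern; the function names here are hypothetical stand-ins.
def by_custom_xpath(doc):
    return ''  # the user gave no rule, or it matched nothing

def by_title_tag(doc):
    return 'Page Title from <title> '

title = (by_custom_xpath('doc') or by_title_tag('doc')).strip()
assert title == 'Page Title from <title>'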
def extractor(self, element: HtmlElement, author_xpath=''):
    # Tip: when chaining .get() lookups, give a missing key the default {}
    # so the following lookup does not raise an error.
    author_xpath = author_xpath or config.get('author', {}).get('xpath')  # fetch the user-defined rule
    # First, try to match the author with the user-defined XPath rule.
    if author_xpath:
        author = ''.join(element.xpath(author_xpath))
        return author
    # Next, try the built-in regex patterns.
    text = ''.join(element.xpath('.//text()'))  # collect all text on the page
    for pattern in self.author_pattern:  # try each pattern in turn
        author_obj = re.search(pattern, text)
        if author_obj:
            return author_obj.group(1)
    # Finally, return an empty string if nothing matched.
    return ''
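# A self-contained illustration of the chained-.get() tip in the comment
# above: defaulting the missing key to {} keeps the second lookup safe.
config_demo = {}  # the 'author' key is absent
assert config_demo.get('author', {}).get('xpath') is None  # no exception
# whereas config_demo['author']['xpath'] would raise KeyError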
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
    publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
    publish_time = (self.extract_from_user_xpath(publish_time_xpath, element)  # the user-supplied XPath has top priority
                    or self.extract_from_meta(element)   # second priority: extract from <meta> tags
                    or self.extract_from_text(element))  # worst case: extract from the body text
    return publish_time
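# Illustration of the second-priority source: many news pages expose the
# publish time in <meta> tags, which is roughly the kind of thing
# extract_from_meta scans for. The property name below is one common
# example (Open Graph-style article metadata), an assumption rather than
# the library's actual lookup list.
from lxml.html import fromstring

page = fromstring(
    '<html><head>'
    '<meta property="article:published_time" content="2020-02-10T10:21:00+08:00"/>'
    '</head><body></body></html>'
)
print(page.xpath('//meta[@property="article:published_time"]/@content'))
# ['2020-02-10T10:21:00+08:00']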
def extract(self, selector, host='', with_body_html=False):
    """
    HTML structure per the W3C standard:
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>Page title</title>
    </head>
    <body>
        <h1>Page body</h1>
    </body>
    </html>
    :param selector:
    :param host:
    :param with_body_html:
    :return:
    """
    body = selector.xpath('//body')[0]  # select the <body> tag
    for node in iter_node(body):
        node_hash = hash(node)
        # Compute the node's text density; returns
        # {'density': density, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi}
        density_info = self.calc_text_density(node)
        # Pull out the text/symbol density
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')  # count the text tags (<p>) under this node
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])  # returns sbdi or 1
        # Resolve image URLs: collect every <img> src and, if the user
        # configured a host (main domain), prepend it.
        images_list = node.xpath('.//img/@src')
        host = host or config.get('host', '')
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {
            'ti': density_info['ti'],
            'lti': density_info['lti'],
            'tgi': density_info['tgi'],
            'ltgi': density_info['ltgi'],
            'node': node,
            'density': text_density,
            'text': ti_text,
            'images': images_list,
            'text_tag_count': text_tag_count,
            'sbdi': sbdi
        }
        # Keep the HTML source of the tag holding the candidate article body
        if with_body_html or config.get('with_body_html', False):
            body_source_code = unescape(etree.tostring(node, encoding='utf-8').decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    std = self.calc_standard_deviation()  # standard deviation of the densities
    self.calc_new_score(std)  # the core scoring function
    # sorted(key=...) picks what to sort by: here we sort the (hash, info)
    # pairs by the 'score' value inside the second element.
    result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
    return result
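# A minimal sketch of the text-density definition that the ti/lti/tgi/ltgi
# fields suggest (the usual formula from text-density content extraction);
# this is an assumption about calc_text_density, which in the real code
# also gathers ti_text and handles more edge cases.
#   ti   = characters under the node
#   lti  = characters inside <a> tags under the node
#   tgi  = tags under the node
#   ltgi = <a> tags under the node
def text_density(ti, lti, tgi, ltgi):
    denominator = tgi - ltgi
    if denominator == 0:
        denominator = 1  # guard against division by zero
    return (ti - lti) / denominator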