def _extract(self, href, referer):
    """Fetch one article page and turn it into a news record.

    The page is downloaded with ``rget`` (``referer`` is forwarded as the
    ``referer`` keyword), parsed with ``etree.HTML`` and queried with XPath.

    Args:
        href: URL of the article page; stored as ``news_url`` in the result.
        referer: Value forwarded to ``rget`` as ``referer``.

    Returns:
        A dict with the keys ``category``, ``site``, ``tag``, ``news_url``,
        ``title``, ``content``, ``author`` and ``publish_time``, or ``None``
        when the download fails, the document cannot be parsed, no title is
        found, or the content is empty or rejected by ``filter_``.
    """
    resp = rget(href, referer=referer)
    if not resp:
        return None

    html = etree.HTML(resp.content)
    # Compare with None explicitly: truth-testing an lxml element checks
    # whether it has children and is deprecated.
    if html is None:
        return None

    # NOTE(review): an alternative selector '//h1[@class="nw"]/text()' was
    # previously present here (commented out) -- presumably for another page
    # layout; confirm whether it is still needed.
    title_nodes = html.xpath('//td[@class="article_title1a"]/h1/text()')
    if not title_nodes:
        return None
    title = title_nodes[0]

    tag_nodes = html.xpath(
        '//div[@id="pagecenter"]/table/tr[2]/td[1]/table/tr/td/a[3]/text()'
    )
    tag = tag_nodes[0] if tag_nodes else '-1'

    # Only the part before the first non-breaking space is kept.
    time_nodes = html.xpath('//td[@align="center"]//text()')
    publish_time = time_nodes[0].split('\xa0')[0] if time_nodes else '-1'

    text = ''.join(html.xpath('//td[@class="article_title2a"]//text()'))
    if len(text) <= 100:
        content = trim(text)
    else:
        # Longer texts get a '&&&' marker inserted after every '。'.
        content = trim('。&&&'.join(text.split('。')))

    if filter_(content) or not content:
        return None

    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '手表',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': '',
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch one article page and turn it into a news record.

    The page is downloaded with ``rget`` (``referer`` is forwarded as the
    ``referer`` keyword), parsed with ``etree.HTML`` and queried with XPath.

    Args:
        href: URL of the article page; stored as ``news_url`` in the result.
        referer: Value forwarded to ``rget`` as ``referer``.

    Returns:
        A dict with the keys ``category``, ``site``, ``tag``, ``news_url``,
        ``title``, ``content``, ``author`` and ``publish_time``, or ``None``
        when the download fails, the document cannot be parsed, no title is
        found, or the content is empty or rejected by ``filter_``.
    """
    resp = rget(href, referer=referer)
    if not resp:
        return None

    html = etree.HTML(resp.content)
    # Compare with None explicitly: truth-testing an lxml element checks
    # whether it has children and is deprecated.
    if html is None:
        return None

    title_nodes = html.xpath('//*[@class="title"]/h1/text()')
    if not title_nodes:
        return None
    title = title_nodes[0]

    tag_nodes = html.xpath('//*[@class="breadcrumb left"]/p/a[2]/text()')
    tag = tag_nodes[0] if tag_nodes else '-1'

    time_nodes = html.xpath('//*[@class="article-attr"]/span[1]/text()')
    publish_time = time_nodes[0] if time_nodes else ''

    # The author is the segment after the first ':' in the span text. Fall
    # back to '' when the span has no ':' instead of raising IndexError.
    author = ''
    author_nodes = html.xpath('//*[@class="article-attr"]/span[4]/text()')
    if author_nodes:
        pieces = author_nodes[0].split(':')
        if len(pieces) > 1:
            author = pieces[1]

    text = ''.join(html.xpath('//*[@class="article"]//p/text()'))
    if len(text) <= 100:
        content = trim(text)
    else:
        # Longer texts get a '&&&' marker inserted after every '。'.
        content = trim('。&&&'.join(text.split('。')))

    if filter_(content) or not content:
        return None

    logger.debug('\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'
                 .format(title, href, tag, len(content)))
    return {
        'category': '手表',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch one article page and turn it into a news record.

    The page is downloaded with ``rget`` (``referer`` is forwarded as the
    ``referer`` keyword), parsed with ``etree.HTML`` and queried with XPath.
    Publish date and author are pulled out of the ``font.f3`` text with a
    regular expression; both default to ``''`` when it does not match.

    Args:
        href: URL of the article page; stored as ``news_url`` in the result.
        referer: Value forwarded to ``rget`` as ``referer``.

    Returns:
        A dict with the keys ``category``, ``site``, ``tag``, ``news_url``,
        ``title``, ``content``, ``author`` and ``publish_time``, or ``None``
        when the download fails, the document cannot be parsed, or the
        content is empty or rejected by ``filter_``.
    """
    resp = rget(href, referer=referer)
    if not resp:
        return None

    html = etree.HTML(resp.content)
    # The previous guard `len(html) is None` could never be true and raised
    # TypeError when parsing produced no document.
    if html is None:
        return None

    tag = ''.join(html.xpath('//div[@class="h"]/a[last()]/text()'))
    title = ''.join(html.xpath('//font[@class="f5"]/text()'))

    meta = trim(''.join(html.xpath('//font[@class="f3"]/text()')))
    # Raw string so that \d and \w are regex escapes, not string escapes.
    found = re.search(r'发布时间:(\d{4}-\d{2}-\d{2})来源:(\w+)', meta)
    if found:
        publish_time, author = found.groups()
    else:
        publish_time = author = ''

    content = ''.join(
        html.xpath(
            '//div[@class="mcontent"]//p[string-length(text()) >1]/text()'
        )
    )
    if content:
        # A '&&&' marker is inserted after every '。'.
        content = trim('。&&&'.join(content.split('。')))
    else:
        # No usable <p> text: fall back to all text under the container.
        # NOTE(review): this fallback is not passed through trim() -- confirm
        # that is intended.
        content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))

    if filter_(content) or not content:
        return None

    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch one article page and turn it into a news record.

    The page is downloaded with ``rget`` (``referer`` is forwarded as the
    ``referer`` keyword), parsed with ``etree.HTML`` and queried with XPath.
    The ``font.f3`` text is split on ``'/'``: the first part is used as the
    author and the second as the publish time.

    Args:
        href: URL of the article page; stored as ``news_url`` in the result.
        referer: Value forwarded to ``rget`` as ``referer``.

    Returns:
        A dict with the keys ``category``, ``site``, ``tag``, ``news_url``,
        ``title``, ``content``, ``author`` and ``publish_time``, or ``None``
        when the download fails, the document cannot be parsed, or the
        content is empty or rejected by ``filter_``.
    """
    resp = rget(href, referer=referer)
    if not resp:
        return None

    html = etree.HTML(resp.content)
    # The previous guard `len(html) is None` could never be true and raised
    # TypeError when parsing produced no document.
    if html is None:
        return None

    title = ''.join(html.xpath('//font[@class="f5 f6"]/text()'))
    tag = ''.join(html.xpath('//div[@id="ur_here"]/a[2]/text()'))

    meta = ''.join(html.xpath('//font[@class="f3"]/text()')).split('/')
    author = meta[0].strip() if meta[0] else '-1'
    # Pages whose metadata has no '/' yield a single part; use '' rather than
    # raising IndexError.
    publish_time = meta[1].strip() if len(meta) > 1 else ''

    # Prefer <span> text; fall back to <p> text when no span carries content.
    content = ''.join(
        html.xpath(
            '//div[@class="box_1"]/div//span[string-length(text()) >1]/text()'
        ))
    if not content:
        content = ''.join(
            html.xpath(
                '//div[@class="box_1"]/div//p[string-length(text()) >1]/text()'
            ))
    # A '&&&' marker is inserted after every '。'.
    content = trim('。&&&'.join(content.split('。')))

    # Discard when the content is filtered OR empty. The earlier `and` let
    # filtered articles and empty articles through.
    if filter_(content) or not content:
        return None

    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{}; \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }