def _extract(self, href, referer):
    """Fetch a WeChat-style article page and extract its fields.

    Args:
        href: article URL to fetch.
        referer: Referer header value passed through to ``rget``.

    Returns:
        A news dict (category/site/tag/news_url/title/content/author/
        publish_time) on success, or None when the fetch fails or the
        page lacks a title/content.
    """
    resp = rget(href, referer=referer)
    if not resp:
        # BUG FIX: the original recursed here without returning, then fell
        # through and dereferenced the falsy `resp` (AttributeError) — and
        # risked unbounded recursion on a permanently failing URL. Give up
        # like the sibling extractors do.
        return
    html = etree.HTML(resp.content)
    # etree.HTML returns None on unparsable input; avoid truth-testing an
    # Element (child-count based and deprecated in lxml).
    if html is None:
        return
    title = ''.join(html.xpath('//*[@id="activity-name"]/text()'))
    if not title:
        return
    title = trim(title)
    # publish_time is embedded in inline JS, not in the DOM.
    publish_time = re.findall(
        r'publish_time = "(\d{4}-\d{2}-\d{2})"?', resp.text)
    publish_time = publish_time[0] if publish_time else ''
    author = trim(''.join(html.xpath('//*[@id="js_name"]/text()')))
    content = ''.join(html.xpath('//*[@id="js_content"]//text()'))
    if not content:
        return
    # '。&&&' marks sentence boundaries for downstream processing.
    content = trim('。&&&'.join(content.split('。')))
    logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
        title, href, len(content)))
    return {
        'category': 'news',
        'site': self.url,
        'tag': -1,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch a bag ('包包') article page and extract its fields.

    Args:
        href: article URL to fetch.
        referer: Referer header value passed through to ``rget``.

    Returns:
        A news dict on success, or None when the fetch fails, the page is
        unparsable, the info bar is missing, or no content is found.
    """
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    # BUG FIX: the original tested `len(html) is None`, which is never True
    # and raises TypeError when etree.HTML returns None on unparsable input.
    if html is None:
        return
    info = html.xpath('//div[@class="info"]/text()')
    if not info:
        # Guard: the original indexed info[-1]/info[0] unconditionally and
        # crashed with IndexError on pages missing the info bar.
        return
    tag = info[-1]
    # The info line starts with the date, separated by a non-breaking space.
    publish_time = info[0].split('\xa0')[0]
    author = ''.join(html.xpath('//div[@class="info"]/a/text()'))
    title = ''.join(html.xpath('//div[@class="article_con"]/h1/text()'))
    content = ''.join(html.xpath('//div[@class="art_con"]//text()'))
    if content:
        # '。&&&' marks sentence boundaries for downstream processing.
        content = trim('。&&&'.join(content.split('。')))
    else:
        # Fallback container used by an alternate page layout.
        content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))
        if not content:
            return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch a watch ('手表') article page and extract its fields.

    Returns a news dict on success; None when the fetch fails, the page is
    unparsable/untitled, or the content is empty or rejected by filter_().
    """
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if not html:
        return
    titles = html.xpath('//td[@class="article_title1a"]/h1/text()')
    if not titles:
        return
    title = titles[0]
    tags = html.xpath(
        '//div[@id="pagecenter"]/table/tr[2]/td[1]/table/tr/td/a[3]/text()'
    )
    tag = tags[0] if tags else '-1'
    times = html.xpath('//td[@align="center"]//text()')
    # The date precedes a non-breaking space in the center cell.
    publish_time = times[0].split('\xa0')[0] if times else '-1'
    body_text = ''.join(html.xpath('//td[@class="article_title2a"]//text()'))
    if len(body_text) > 100:
        # '。&&&' marks sentence boundaries for downstream processing.
        content = trim('。&&&'.join(body_text.split('。')))
    else:
        content = trim(body_text)
    if not content or filter_(content):
        return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '手表',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': '',
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch a watch ('手表') article page and extract its fields.

    Returns a news dict on success; None when the fetch fails, the page is
    unparsable/untitled, or the content is empty or rejected by filter_().
    """
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if not html:
        return
    titles = html.xpath('//*[@class="title"]/h1/text()')
    if not titles:
        return
    title = titles[0]
    tags = html.xpath('//*[@class="breadcrumb left"]/p/a[2]/text()')
    tag = tags[0] if tags else '-1'
    times = html.xpath('//*[@class="article-attr"]/span[1]/text()')
    publish_time = times[0] if times else ''
    authors = html.xpath('//*[@class="article-attr"]/span[4]/text()')
    # Author span looks like "label:name"; keep the part after the colon.
    author = authors[0].split(':')[1] if authors else ''
    body_text = ''.join(html.xpath('//*[@class="article"]//p/text()'))
    if len(body_text) > 100:
        # '。&&&' marks sentence boundaries for downstream processing.
        content = trim('。&&&'.join(body_text.split('。')))
    else:
        content = trim(body_text)
    if not content or filter_(content):
        return
    logger.debug('\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'
                 .format(title, href, tag, len(content)))
    return {
        'category': '手表',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch a bag ('包包') article page and extract its fields.

    Args:
        href: article URL to fetch.
        referer: Referer header value passed through to ``rget``.

    Returns:
        A news dict on success, or None when the fetch fails, the page is
        unparsable, or the content is empty/rejected by filter_().
    """
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    # BUG FIX: the original tested `len(html) is None`, which is never True
    # and raises TypeError when etree.HTML returns None on unparsable input.
    if html is None:
        return
    tag = ''.join(html.xpath('//div[@class="h"]/a[last()]/text()'))
    title = ''.join(html.xpath('//font[@class="f5"]/text()'))
    other = trim(''.join(html.xpath('//font[@class="f3"]/text()')))
    # Raw string to avoid invalid-escape-sequence warnings on \d and \w.
    other = re.findall(r'发布时间:(\d{4}-\d{2}-\d{2})来源:(\w+)', other)
    if other:
        publish_time = other[0][0]
        author = other[0][1]
    else:
        publish_time = author = ''
    # Prefer paragraphs with real text; string-length filter drops
    # whitespace-only nodes.
    content = ''.join(
        html.xpath(
            '//div[@class="mcontent"]//p[string-length(text()) >1]/text()')
    )
    if content:
        # '。&&&' marks sentence boundaries for downstream processing.
        content = trim('。&&&'.join(content.split('。')))
    else:
        # Fallback: take all text under the content container.
        content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))
    if filter_(content) or not content:
        return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch a news page whose fields live in inline JS, and extract them.

    Title comes from the DOM; time/author/tag/content are pulled from the
    page source with regexes. Returns a news dict, or None when the fetch
    fails or the page lacks a title/content.
    """
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if not html:
        return
    title = ''.join(html.xpath('//title/text()'))
    if not title:
        return
    title = trim(title)
    time_hits = re.findall(
        r"time: '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'?", resp.text)
    publish_time = time_hits[0] if time_hits else ''
    author = trim(''.join(re.findall(r"name: '(\w+)'?", resp.text)))
    # Tags are serialized as a JSON-ish list of {"name": ...} objects.
    tag = ','.join(re.findall(r'{"name":"(\w+)"}\]?', resp.text))
    content = ''.join(re.findall(r"content: '(.+)'?", resp.text))
    if not content:
        return
    content = trim(content)
    # Strip leftover markup fragments and URL-encoded residue.
    content = re.sub('[<>"pa-z\/#3D\.-:_]', '', content)
    # '。&&&' marks sentence boundaries for downstream processing.
    content = '。&&&'.join(content.split('。'))
    logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
        title, href, len(content)))
    return {
        'category': 'news',
        'site': self.url,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch a news article page and extract its fields.

    Args:
        href: article URL to fetch.
        referer: Referer header value passed through to ``rget``.

    Returns:
        A news dict on success, or None when the fetch fails or the page
        lacks a title/content.
    """
    resp = rget(href, referer=referer)
    if not resp:
        # BUG FIX: the original recursed here without returning, then fell
        # through and dereferenced the falsy `resp` (AttributeError) — and
        # risked unbounded recursion on a permanently failing URL. Give up
        # like the sibling extractors do.
        return
    html = etree.HTML(resp.content)
    # etree.HTML returns None on unparsable input; avoid truth-testing an
    # Element (child-count based and deprecated in lxml).
    if html is None:
        return
    title = ''.join(html.xpath('//h2[@class="title-info"]/text()'))
    if not title:
        return
    title = trim(title)
    publish_time = trim(''.join(
        html.xpath('//footer[@class="time"]/text()')))
    author = trim(''.join(html.xpath('//header[@class="name"]/text()')))
    # Body may be split into a visible part and a collapsed ("hidden") part.
    content = html.xpath('//div[@class="display-content"]//p/text()') + \
        html.xpath('//div[@class="hidden-content hide"]//p/text()')
    content = ''.join(content)
    if not content:
        return
    # '。&&&' marks sentence boundaries for downstream processing.
    content = trim('。&&&'.join(content.split('。')))
    logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
        title, href, len(content)))
    return {
        'category': 'news',
        'site': self.url,
        'tag': '-1',
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    """Fetch a bag ('包包') article page and extract its fields.

    Args:
        href: article URL to fetch.
        referer: Referer header value passed through to ``rget``.

    Returns:
        A news dict on success, or None when the fetch fails, the page is
        unparsable, or the content is empty/rejected by filter_().
    """
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    # BUG FIX: the original tested `len(html) is None`, which is never True
    # and raises TypeError when etree.HTML returns None on unparsable input.
    if html is None:
        return
    title = ''.join(html.xpath('//font[@class="f5 f6"]/text()'))
    tag = ''.join(html.xpath('//div[@id="ur_here"]/a[2]/text()'))
    # The f3 line looks like "author / date"; split once and strip.
    other = ''.join(html.xpath('//font[@class="f3"]/text()')).split('/')
    # Guard: the original indexed other[1] unconditionally and crashed with
    # IndexError when the line carried no '/'.
    publish_time = other[1].strip() if len(other) > 1 else ''
    author = other[0].strip() if other[0] else '-1'
    # Prefer spans with real text; string-length filter drops
    # whitespace-only nodes.
    content = ''.join(
        html.xpath(
            '//div[@class="box_1"]/div//span[string-length(text()) >1]/text()'
        ))
    if not content:
        content = ''.join(
            html.xpath(
                '//div[@class="box_1"]/div//p[string-length(text()) >1]/text()'
            ))
    # '。&&&' marks sentence boundaries for downstream processing.
    content = trim('。&&&'.join(content.split('。')))
    # BUG FIX: the original used `filter_(content) and not content`, which
    # never rejects filtered nonempty content; siblings use `or`.
    if filter_(content) or not content:
        return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{}; \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }