def get_content(self, response):
    meta = response.meta
    contents = response.xpath(
        '//div[@class="cnt_bd"]/p[not(@style)] | '
        '//div[@class="shizhendema_Aind_9810_2013120304"]/div[@class="bd"]/p[not(@style)] | '
        '//div[@id="content_area"]/p').xpath('string()').extract()
    content = ''
    # Match "原标题" followed by either the full-width or the ASCII colon.
    pattern = r'原标题[::]'
    for i in contents:
        if re.search(pattern, i):
            continue
        content += i.strip()
    item = JoviLonglasttimeItem()
    item['article_title'] = meta['title']
    # Inline player bootstrap script that leaks into the page text; stripped verbatim below.
    rep_content = ('var fo = createPlayer("v_player",540,400);'
                   'fo.addVariable("videoId","vid");'
                   'fo.addVariable("videoCenterId","bb13275ded2b46638e9ffc02983aaf38");'
                   'fo.addVariable("videoType","0");'
                   'fo.addVariable("videoEditMode","1");'
                   'fo.addVariable("isAutoPlay","true");'
                   'fo.addVariable("tai","news");'
                   'fo.addVariable("languageConfig","");'
                   'fo.addParam("wmode","opaque");'
                   'writePlayer(fo,"embed_playerid");')
    item['article_content'] = content.replace('\n', '').replace(
        '\t', '').replace('\r', '').replace('\xa0', '').replace(
        '\u3000', '').replace(rep_content, '')
    item['first_tag'] = '央视新闻'
    item['second_tag'] = meta['second_tag']
    item['article_url'] = response.url
    yield item

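# Every parser in this file ends with the same chain of str.replace calls to
# strip control characters and full-width whitespace. A small helper such as
# this sketch (hypothetical, not part of the original spiders) would remove
# the duplication:
def clean_text(text, extra=()):
    # Strip CR/LF/tab, NBSP, ideographic space, zero-width space and BOM,
    # plus any caller-supplied fragments (e.g. the player script above).
    for junk in ('\r', '\n', '\t', '\xa0', '\u3000', '\u200b', '\ufeff') + tuple(extra):
        text = text.replace(junk, '')
    return text
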
def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    contents = response.xpath(
        '//div[@id="whole_content"]/p/text()').extract()
    content = ''
    # Byline/noise filter. The standalone | is the full-width vertical bar
    # used in Chinese bylines; an ASCII | there would be an empty alternative
    # that matches every paragraph.
    pattern = (r'/|图\d:|图文|说明:|原标题|原题|选自:|公众号|▲|文章来自|本文|来源|||'
               r'来自网络|作者:|声明:|译自|如有侵权|\||编辑|编者|往期回顾|记者|点击进入|'
               r'联合出品|图为|提示:|导语:|资料图|图注:')
    for i in contents:
        if re.search(pattern, i, re.S):
            continue
        content += i.strip()
    item['article_content'] = content.replace('\r', '').replace(
        '\n', '').replace('\t', '').replace('\u3000', '').replace(
        '\xa0', '').replace('\u200b', '')
    item['first_tag'] = '手机凤凰网'
    item['second_tag'] = meta['second_tag']
    item['update_time'] = response.xpath(
        '//div[contains(@class,"acTxtTit ")]//span[1]/text()').extract_first()
    item['source'] = response.xpath(
        '//div[contains(@class,"acTxtTit ")]//span[last()]/text()').extract_first()
    item['article_title'] = meta['title']
    item['article_url'] = response.url
    yield item

def get_content(self, response):
    try:
        meta = response.meta
        item = JoviLonglasttimeItem()
        content = re.search(r'"content":"(.*?)","thumbnails"',
                            response.text, re.S).group(1)
        article_contents = etree.HTML(content).xpath('//p//text()')
        ptime = int(re.search(r'"publish_time":(\d+),',
                              response.text, re.S).group(1))
        publish_time = datetime.fromtimestamp(ptime / 1000)
        pub_time = publish_time.strftime('%Y-%m-%d')
        article_content = ''
        # The standalone | is the full-width vertical bar used in bylines.
        pattern = (r'图文|说明:|原标题|原题|选自:|公众号|▲|文章来自|本文|来源|||'
                   r'来自网络|作者:|声明:|译自|如有侵权|\||编辑|编者|往期回顾|记者|'
                   r'点击进入|联合出品|提示:|导语:')
        for i in article_contents:
            if re.search(pattern, i, re.S):
                continue
            article_content += i.strip()
        item['article_content'] = article_content.replace('\r', '').replace(
            '\n', '').replace('\t', '').replace('\xa0', '').replace('\u3000', '')
        item['first_tag'] = 'UC头条'
        item['second_tag'] = meta['third_tag']
        item['article_url'] = response.url
        item['source'] = meta['source']
        item['label'] = meta['label']
        item['update_time'] = pub_time
        item['article_title'] = meta['title']
        yield item
    except Exception:
        print('请求异常----%s' % response.url)

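# "publish_time" in the page JSON is a millisecond epoch, hence the division
# by 1000 above. Illustrative conversion (exact date depends on the local
# timezone):
#   datetime.fromtimestamp(1546300800000 / 1000)  # around 2019-01-01
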
def get_content(self, response):
    meta = response.meta
    # Articles are split across numbered pages; the link following the
    # current-page marker points at the next page.
    current_page = response.xpath('//a[@class="cur"]')
    next_page = current_page.xpath('following-sibling::a')
    contents = response.xpath(
        '//div[@id="J-contain_detail_cnt"]//text()').extract()
    content = ''
    for i in contents:
        # The standalone | is the full-width vertical bar used in bylines.
        if re.search(
                r'原标题:|选自:|公众号|▲|文章来自|本文|来源|||来自网络|作者:|声明:|'
                r'译自|如有侵权|\||责任编辑:|编者按|往期回顾|记者|点击进入|联合出品|'
                r'【精彩推荐】|·', i):
            continue
        content += i.strip()
    # Accumulate page fragments in meta until the last page is reached.
    meta['content'] += content
    if next_page:
        url = 'https://mini.eastday.com/a/' + next_page.xpath(
            '@href').extract_first()
        yield scrapy.Request(url=url, callback=self.get_content, meta=meta)
    else:
        item = JoviLonglasttimeItem()
        item['first_tag'] = meta['first_tag']
        item['second_tag'] = meta['second_tag']
        item['third_tag'] = meta['third_tag']
        item['source'] = meta['source']
        item['update_time'] = response.xpath(
            '//div[@class="fl"]/i[1]/text()').re_first(r'\d+-\d+-\d+')
        # Strip the "-N" page suffix so the canonical URL is stored.
        item['article_url'] = re.sub(r'-\d+', '', response.url)
        item['article_title'] = meta['title']
        item['article_content'] = meta['content'].replace(
            '\r', '').replace('\n', '').replace('\t', '').replace(
            '\xa0', '').replace('\u3000', '').replace('\ufeff', '')
        yield item

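# Sketch (assumption): response.urljoin could build the next-page URL without
# hard-coding the host:
#   url = response.urljoin(next_page.xpath('@href').extract_first())
# The explicit 'https://mini.eastday.com/a/' prefix is kept above in case the
# site's hrefs omit the leading path segment.
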
def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    item['article_url'] = response.url
    item['first_tag'] = meta['first_tag']
    item['second_tag'] = meta['second_tag']
    # Per-host xpath rules are looked up by the URL's netloc.
    host = urlparse(response.url).netloc
    xpath = self.xpath.get(host)
    if xpath:
        item['article_title'] = response.xpath(
            xpath['title']).get(default='').strip()
        ps = response.xpath(xpath['ps']).getall()
    else:
        logger.info('No parsing xpath configured for this host: {}'.format(
            response.url))
        return
    content = ''
    for p in ps:
        # 文\| and 文/ mark "文|作者"/"文/作者" bylines; a bare 文 alternative
        # would match nearly every paragraph. A stray unformatted '{}'
        # placeholder in the original pattern was a dead alternative and is
        # dropped.
        if re.search(
                r'责任编辑:|作者:|出处:|来自:|来源 :|来源:|来源 : |图片来自|图片由|'
                r'图:|更多精彩|请投稿至:|文\||文/|编辑', p):
            continue
        elif re.search(r'关注微信公众号|参考资料|声明:|原网页已经由 ZAKER 转码排版 |推荐阅读', p):
            # Everything after these markers is footer material; stop here.
            break
        else:
            content += p.strip()
    item['article_content'] = content.replace('\n', '').replace(
        '\r', '').replace('\t', '').replace('\u3000', '').replace('\xa0', '')
    yield item

def get_content(self, response):
    meta = response.meta
    if response.body:
        contents = response.xpath(
            '//article/p//text() | '
            '//section[@class="art_pic_card art_content"]/p//text() | '
            '//div[@class="article"]/p//text()').extract()
        content = ''
        for i in contents:
            if re.search(
                    r'原标题:|图片来自|图片来源|文章转自|文章来自|本文来源|本文来自|'
                    r'作者:|微信公众号|更多信息请关注|来源:|如有侵权|点击进入专题|'
                    r'作者署名|本文是|ID:|✎|文\|', i):
                continue
            content += i.strip()
        item = JoviLonglasttimeItem()
        item['first_tag'] = meta['first_tag']
        item['second_tag'] = meta['second_tag']
        item['third_tag'] = meta['third_tag']
        item['article_url'] = response.url
        item['source'] = meta['source']
        item['update_time'] = meta['update_time']
        item['article_title'] = meta['title']
        item['article_content'] = content.replace('\r', '').replace(
            '\n', '').replace('\t', '').replace('\xa0', '').replace('\u3000', '')
        yield item

def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    article_contents = response.xpath(
        '//div[@class="post_content"]//p//text()').extract()
    content = ''
    # The standalone | is the full-width vertical bar used in bylines.
    # "相关阅读:" is checked first so the loop actually stops there; it also
    # sat in the skip pattern before, which made the break unreachable.
    pattern = (r'图文|说明:|原标题|原题|选自:|公众号|▲|文章来自|本文|来源|||来自网络|'
               r'作者:|声明:|译自|如有侵权|\||编辑|编者|往期回顾|记者|点击进入|提示:|'
               r'导语:|转载联系|责编')
    for i in article_contents:
        if re.search(r'相关阅读:', i):
            break
        elif re.search(pattern, i, re.S):
            continue
        else:
            content += i.strip()
    item['article_content'] = content.replace('\r', '').replace(
        '\t', '').replace('\n', '').replace('\xa0', '').replace('\u3000', '')
    try:
        item['article_title'] = response.xpath(
            '//div[@class="post_title"]//text()').extract_first().strip()
    except AttributeError:
        item['article_title'] = ''
    item['first_tag'] = 'IT之家'
    item['second_tag'] = meta['second_tag']
    item['third_tag'] = meta['third_tag']
    item['article_url'] = response.url
    yield item

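# The skip/stop loop above recurs in every parser in this file. A shared
# helper along these lines (a sketch, not part of the original code) would
# turn the per-site rules into data instead of copy-pasted control flow:
def filter_paragraphs(paragraphs, skip_pattern, stop_pattern=None):
    # Concatenate paragraphs, dropping byline noise and cutting off at the
    # first footer marker.
    content = ''
    for p in paragraphs:
        if stop_pattern and re.search(stop_pattern, p):
            break
        if re.search(skip_pattern, p):
            continue
        content += p.strip()
    return content
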
def get_content(self, response):
    meta = response.meta
    contents = response.xpath(
        '//div[contains(@class,"page js-page")]/p//text()').extract()
    content = ''
    for i in contents:
        # The standalone | is the full-width vertical bar used in bylines.
        if re.search(
                r'原标题:|选自:|公众号|▲|文章来自|本文|来源|||来自网络|作者:|声明:|'
                r'译自|如有侵权|\||编辑:|编者按|往期回顾|记者|点击进入|联合出品|'
                r'【精彩推荐】|·', i):
            continue
        content += i.strip()
    item = JoviLonglasttimeItem()
    item['first_tag'] = meta['first_tag']
    item['second_tag'] = meta['second_tag']
    item['third_tag'] = meta['third_tag']
    item['label'] = ''
    item['source'] = meta['source']
    item['update_time'] = meta['update_time']
    item['article_url'] = response.url
    item['article_title'] = meta['title']
    item['article_content'] = content.replace('\r', '').replace(
        '\n', '').replace('\t', '').replace('\xa0', '').replace('\u3000', '')
    yield item

def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    contents = response.xpath('//div[@id="artibody"]/p').xpath(
        'string()').extract()
    # Duplicate 原文 alternative removed; the standalone | is the full-width
    # vertical bar used in bylines.
    pattern = (r'点击[上下]方|关注(.*?)公众号|关注(.*?)微信|↓|原文|相关阅读|说明:|'
               r'原标题|原题|选自:|公众号|▲|文章来自|本文|||来自网络|作者:|声明:|'
               r'译自|如有侵权|\||编辑|编者|往期回顾|记者|点击进入|提示:|导语:|'
               r'转载联系|责编|译者:|来源:')
    content = ''
    for i in contents:
        if re.search(pattern, i):
            continue
        content += i.strip()
    item['article_content'] = content.replace('\r', '').replace(
        '\t', '').replace('\n', '').replace('\xa0', '').replace(
        '\u3000', '').replace('\u200b', '')
    item['article_title'] = meta['title']
    item['article_url'] = response.url
    item['first_tag'] = '新浪游记'
    item['second_tag'] = meta['second_tag']
    item['third_tag'] = meta['third_tag']
    item['update_time'] = meta['update_time']
    item['label'] = ''
    item['source'] = ''
    yield item

def parse_article(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    ps = response.xpath('//div[@id="content"]/p/text()').getall()
    content = ''.join(
        p.strip().replace('\n', '').replace('\r', '') for p in ps)
    item['article_url'] = meta['article_url']
    item['first_tag'] = 'Zaker新闻'
    item['second_tag'] = meta['second_tag']
    item['article_title'] = meta['article_title']
    item['article_content'] = content
    yield item

def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    data = json.loads(response.text)
    item['first_tag'] = '凤凰新闻'
    item['second_tag'] = meta['second_tag']
    item['article_url'] = response.url
    body = data.get('body', {})
    item['article_title'] = body.get('title')
    # The article body is an HTML fragment embedded in the JSON payload.
    s = scrapy.Selector(text=body.get('text', ''))
    item['article_content'] = ''.join(s.xpath('//p//text()').getall())
    yield item

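# Hypothetical shape of the JSON payload this parser expects (field names
# taken from the code above, values illustrative):
#   {"body": {"title": "...", "text": "<p>first paragraph</p><p>second</p>"}}
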
def get_content(self, response):
    meta = response.meta
    jr = json.loads(response.text)
    item = JoviLonglasttimeItem()
    item['first_tag'] = '一点资讯'
    item['second_tag'] = meta['second_tag']
    item['article_url'] = response.url
    try:
        item['article_title'] = jr['documents'][0]['title']
        s = Selector(text=jr['documents'][0]['content'])
    except (KeyError, IndexError, TypeError):
        return
    ps = s.xpath('//p[not(@class)]//text()').getall()
    item['article_content'] = ''.join(p.strip() for p in ps)
    yield item

def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    # Two URL formats, each with different extraction rules: old-style article
    # URLs carry the content in the page HTML, the rest embed it in an
    # allData JavaScript object.
    pattern = r'https?://\S+\.ifeng\.com/a/\d{8}/\d+_0\.s?html'
    if re.search(pattern, response.url):
        contents = response.xpath(
            '//div[@id="main_content"]/p//text()').extract()
    else:
        try:
            all_data = json.loads(
                re.search(r'var allData = (.*?);\n', response.text).group(1))
            doc_data = all_data['docData']
            content_type = doc_data['contentData']['contentList'][-1]['type']
            if content_type == 'text':
                content_data = doc_data['contentData']['contentList'][-1]['data']
                contents = scrapy.Selector(text=content_data).xpath(
                    '//p[not(@class)]//text()').extract()
            else:
                contents = []
                print('内容是视频或者图片----%s' % response.url)
        except Exception as e:
            print('可能发生跳转或者没有内容----%s' % response.url)
            print(e)
            contents = []
    # The standalone | alternatives are the full-width vertical bars used in
    # "记者|…" / "编辑|…" bylines.
    pattern1 = (r'编辑:|注:|关注(.*?)公众号|作者:|请关注|微信号:|本文为|未经授权|'
                r'作者原创|微信公号:|微信ID:|作者简介:|原标题:|记者|||编辑|||来源:')
    content = ''
    for i in contents:
        if re.search(pattern1, i):
            continue
        elif re.search(r'- END -|END', i):
            break
        else:
            content += i.strip()
    item['first_tag'] = '凤凰网'
    item['second_tag'] = meta['second_tag']
    item['third_tag'] = meta['third_tag']
    item['article_url'] = response.url
    item['article_title'] = meta['title']
    item['article_content'] = content.replace('\r', '').replace(
        '\n', '').replace('\t', '').replace('\xa0', '').replace('\u3000', '')
    yield item

def get_content(self, response):
    item = JoviLonglasttimeItem()
    meta = response.meta
    article = response.xpath(
        '//div[@class="article"]/p[not(@class)]//text()').extract()
    content = ''
    # re.escape keeps regex metacharacters in the title from breaking the
    # pattern.
    pattern = r'原标题:|特别声明:|{}'.format(re.escape(meta['title']))
    for i in article:
        if re.search(pattern, i):
            continue
        content += i.strip()
    item['article_content'] = content
    item['article_title'] = meta['title']
    item['article_url'] = response.url
    item['first_tag'] = '新浪滚动'
    item['second_tag'] = '新浪滚动'
    item['third_tag'] = meta['third_tag']
    yield item

def get_content(self, response):
    meta = response.meta
    res = response.text
    item = JoviLonglasttimeItem()
    # The page embeds its fields as JS string literals padded with 6 junk
    # characters on each side, mirrored here by the [6:-6] slices.
    content = re.search(r" content: '(.*?)'\.slice\(6, -6\),", res)
    title = re.search(r" title: '(.*?)'\.slice\(6, -6\),", res)
    second_tag = re.search(r"chineseTag: '(.*?)',", res)
    if content:
        content = content.group(1)[6:-6]
    else:
        log.msg('此URL没有文章----%s' % response.url, level=log.INFO)
        return
    if title:
        title = title.group(1)[6:-6]
    else:
        log.msg('此URL没有标题----%s' % response.url, level=log.INFO)
        return
    if second_tag:
        second_tag = second_tag.group(1)
    else:
        log.msg('此URL没有二级标签----%s' % response.url, level=log.INFO)
        return
    # Map HTML entities back to plain characters before parsing.
    for k, j in self.HTML_entity.items():
        content = content.replace(k, j)
    e = etree.HTML(content).xpath('//p//text()')
    # The standalone | is the full-width vertical bar used in bylines.
    pattern = (r'图片来自|原标题:|选自:|公众号|▲|文章来自|本文|来源|||来自网络|作者:|'
               r'声明:|译自|如有侵权|\||责任编辑:|编者按|往期回顾|记者|点击进入|联合出品|'
               r'【精彩推荐】|·|责编|源丨|文丨|转载联系')
    article = ''
    for i in e:
        if re.search(pattern, i):
            continue
        article += i.strip()
    article = article.replace('\r', '').replace('\n', '').replace(
        '\t', '').replace('\xa0', '').replace('\u3000', '').replace('\u200b', '')
    item['article_content'] = article
    item['article_title'] = title
    item['first_tag'] = meta['first_tag']
    item['second_tag'] = second_tag
    item['article_url'] = response.url
    yield item

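# self.HTML_entity is presumably a dict mapping entity strings (e.g. '&nbsp;')
# to characters. The standard library's html.unescape handles named and
# numeric entities and could replace the table-driven loop above (a sketch;
# any custom entries in the table would still need the loop):
#   import html
#   content = html.unescape(content)
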
def get_article(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    contents = response.xpath(
        '//div[@id="p-detail"]/p[not(@class)]/text()').extract()
    content = ''
    pattern = r' 策划:|撰文:'
    for i in contents:
        if re.search(pattern, i):
            continue
        content += i.strip()
    item['first_tag'] = '新华网'
    item['second_tag'] = meta['second_tag']
    item['article_url'] = response.url
    item['article_title'] = meta['article_title']
    item['article_content'] = content.replace('\r', '').replace(
        '\n', '').replace('\t', '').replace('\xa0', '').replace('\u3000', '')
    yield item

def get_content(self, response):
    item = JoviLonglasttimeItem()
    item['article_url'] = response.url
    item['first_tag'] = '人民网'
    # Channel (second_tag) is looked up from the subdomain.
    item['second_tag'] = self.channels.get(urlparse(response.url).netloc)
    item['article_title'] = response.xpath('//h1/text()').get()
    # One union expression covering the body containers of the site's many
    # page templates.
    xpath = ('//*[@id="rwb_zw"]//p//text() | '
             '//*[@class="box_con"]//p//text() | '
             '//*[@class="box_con w1000 clearfix"]//p//text() | '
             '//*[@class="content clear clearfix"]//p//text() | '
             '//*[@class="show_text"]//p//text() | '
             '//*[@id="p_content"]//p//text() | '
             '//*[@class="artDet"]//p//text() | '
             '//*[@class="text"]//p//text() | '
             '//*[@class="text width978 clearfix"]//p//text() | '
             '//*[@id="zoom"]//p//text() | '
             '//*[@class="text_show"]//p//text()')
    item['article_content'] = ''.join(
        p.strip() for p in response.xpath(xpath).getall())
    yield item

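# Alternative sketch (hypothetical BODY_XPATHS holding the paths above): try
# the template xpaths one at a time and stop at the first that yields text,
# which also reveals which template matched:
#   for candidate in BODY_XPATHS:
#       paragraphs = response.xpath(candidate).getall()
#       if paragraphs:
#           break
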
def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    contents = response.xpath(
        '//p[@class="text"]/text() | //h2[@class=""]/text()').extract()
    content = ''
    pattern = r'版权声明|来自:|关注:|搜索:|图:|点击播放|公众号:|文章来源'
    for i in contents:
        if re.search(pattern, i):
            continue
        content += i.strip()
    # Apart from the copyright notice there is no other junk, so plain
    # replace calls are enough to clean up.
    # This site only needs two tag levels; to keep the document structure
    # simple, the corresponding pipeline is adapted accordingly.
    item['first_tag'] = '天天快报'
    item['second_tag'] = meta['second_tag']
    item['article_url'] = response.url
    item['article_title'] = meta['title']
    item['article_content'] = content.replace('\n', '').replace(
        '\t', '').replace('\r', '').replace('\u3000', '').replace('\xa0', '')
    yield item

def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    contents = response.xpath(
        '//*[@class="article"]/p[not(@data-role)]//text() | '
        '//*[@class="article-text"]/p//text()').extract()
    article_content = ''
    # The standalone | is the full-width vertical bar used in bylines.
    pattern = (r'返回搜狐|原标题:|选自:|公众号|▲|文章来自|本文|来源|||来自网络|作者:|'
               r'声明:|译自|如有侵权|\||编辑:|编者按|往期回顾|记者|点击进入|联合出品|'
               r'【精彩推荐】|·|导读|导言:|导读:')
    for i in contents:
        if re.search(pattern, i):
            continue
        article_content += i.strip()
    item['article_content'] = article_content.replace('\r', '').replace(
        '\n', '').replace('\t', '').replace('\u3000', '').replace('\xa0', '')
    item['article_url'] = response.url
    item['article_title'] = meta['title']
    item['first_tag'] = meta['first_tag']
    item['second_tag'] = meta['second_tag']
    item['third_tag'] = meta['third_tag']
    yield item

def get_content(self, response):
    if response.body:
        meta = response.meta
        item = JoviLonglasttimeItem()
        item['article_title'] = response.xpath('//h1/text()').extract_first()
        contents = response.xpath(
            '//*[@class="content-article"]/p').xpath('string()').extract()
        content = ''
        for i in contents:
            if re.search(r'原标题:|图片来自|图片来源|作者:|微信公众号|更多信息请关注|来源:', i):
                continue
            content += i.strip()
        item['article_content'] = content.replace('\r', '').replace(
            '\n', '').replace('\t', '').replace('\u3000', '').replace('\xa0', '')
        item['article_url'] = response.url
        item['first_tag'] = meta['first_tag']
        item['second_tag'] = meta['second_tag']
        item['third_tag'] = meta['third_tag']
        item['update_time'] = meta['update_time']
        item['source'] = meta['source']
        yield item