def parse_login(self, response):
    # print("Logging in....")
    # Inspect the response body to decide whether the login succeeded.
    json_res = json.loads(response.text)
    if json_res["retcode"] == 50060000:
        print('Captcha required, please run from a usual login location')
        print(json_res["data"]['errurl'])
    elif json_res["retcode"] == 20000000:
        print('Login successful')
        args = SARunner().parser()
        keyword_list = SARunner().keyworld_list(args.anaentities)
        keyword = keyword_list.replace(u"|", "~")
        seek_url = "https://weibo.cn/search/"
        fd = {
            'advancedfilter': '1',
            'keyword': keyword,
            'nick': '',
            'starttime': self.starttime,
            'endtime': self.endtime,
            'sort': 'time',
            'smblog': '搜索',  # value of the search submit button; must stay as-is
        }
        print('Search keyword:', keyword)
        yield scrapy.FormRequest(
            url=seek_url,
            formdata=fd,
            callback=self.parse_info,
        )
    else:
        print('Login failed!')
def parse_info(self, response):
    weibo_list = response.xpath("//div[@class='c' and @id]")
    for weibo in weibo_list:
        item = Weibospider1Item()
        div = weibo.xpath("./div")
        if len(div) == 1:
            # Weibo type: original post without images
            item["category"] = "无图原创"
            item["author"] = weibo.xpath(
                "./div/a[@class='nk']/text()").extract_first()
            item['author_id'] = weibo.xpath(
                "./div[1]/a[@class='nk']/@href").extract_first()
            item["content"] = weibo.xpath(
                "./div/span[@class='ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div/text()|./div/span[@class='ctt']//text()").extract()
            item["dianzan"] = weibo.xpath("./div/a/text()").extract()[-4]
            item["relay"] = weibo.xpath("./div/a/text()").extract()[-3]
            item["comment"] = weibo.xpath(
                "./div/a[@class='cc']/text()").extract_first()
            item["comment_url"] = weibo.xpath(
                "./div/a[@class='cc']/@href").extract_first()
            item["send_time"] = weibo.xpath(
                "./div/span[@class='ct']/text()").extract_first()
            item["reason"] = None
            item["img_url"] = None
            item['reason_name'] = None
            item['reason_id'] = None
        elif len(div) == 2:
            item["category"] = ""
            item["content"] = weibo.xpath(
                "./div[1]/span[@class='ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div[1]/text()|./div[1]/span[@class='ctt']//text()").extract()
            item["relay"] = weibo.xpath("./div[2]/a/text()").extract()[-3]
            item["comment"] = weibo.xpath(
                "./div[2]/a[@class='cc']/text()").extract_first()
            item["reason"] = None
            img = weibo.xpath("./div[2]//img[@class='ib']/@src")
            if len(img) == 0:
                # Repost without images
                item['category'] = "无图转发"
                item["author"] = weibo.xpath(
                    "./div/span[@class = 'cmt']/a/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
                item['reason_id'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
                item["dianzan"] = weibo.xpath("./div[2]/a/text()").extract()[-4]
                item["reason"] = weibo.xpath(
                    "./div[2]/text()|./div[2]//span[@class='kt']/text()").extract()
                item["comment_url"] = weibo.xpath(
                    "./div[2]/a[@class='cc']/@href").extract_first()
                item["img_url"] = None
                item["send_time"] = weibo.xpath(
                    "./div[2]/span[@class='ct']/text()").extract_first()
            else:
                # Original post with images
                item['category'] = "有图原创"
                item["author"] = weibo.xpath(
                    "./div/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = None
                item['reason_id'] = None
                item["dianzan"] = weibo.xpath("./div[2]/a/text()").extract()[-4]
                item["img_url"] = weibo.xpath(
                    "./div[2]//img[@class='ib']/@src").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div[2]/a[@class='cc']/@href").extract_first()
                item["send_time"] = weibo.xpath(
                    "./div[2]/span[@class='ct']/text()").extract_first()
        else:
            # len(div) == 3: repost with images
            item["category"] = "带图片转发"
            item["author"] = weibo.xpath(
                "./div[1]/a[@class='nk']/text()").extract_first()
            item['author_id'] = weibo.xpath(
                "./div[1]/a[@class='nk']/@href").extract_first()
            item['reason_name'] = weibo.xpath(
                "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
            item['reason_id'] = weibo.xpath(
                "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
            item["content"] = weibo.xpath(
                "./div[1]/span[@class = 'ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div[1]/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div[1]/text()|./div[1]/span[@class='ctt']//text()").extract()
            item["send_time"] = weibo.xpath(
                "./div[3]/span[@class='ct']/text()").extract_first()
            item["dianzan"] = weibo.xpath("./div[3]/a/text()").extract()[-4]
            item["relay"] = weibo.xpath("./div[3]/a/text()").extract()[-3]
            item["comment"] = weibo.xpath(
                "./div[3]/a[@class='cc']/text()").extract_first()
            item["comment_url"] = weibo.xpath(
                "./div[3]/a[@class='cc']/@href").extract_first()
            item["img_url"] = weibo.xpath(
                "./div[2]//img[@class='ib']/@src").extract_first()
            item["reason"] = weibo.xpath(
                "./div[3]/text()|./div[3]//span[@class='kt']/text()").extract()
        item['relay_url'] = ''
        item['TID'] = re.findall(r'uid=.{1,}&', item["comment_url"])[0][4:-1]
        a = weibo.xpath("//a[@class='nk']/@href").extract()
        yield item
        article = Article(tid=item['TID'],
                          channel_id=9,
                          content=item['content'],
                          publish_datetime=item['send_time'],
                          url=item['comment_url'],
                          title=item['content'][0:100],
                          author_id=item['author_id'],
                          author_name=item['author'])
        article.statistics = ArticleStatistics(
            tid=item['TID'],
            channel_id=9,
            reply_count=item['comment'],
            forward_count=item['relay'],
            like_count=item['dianzan'],
        )
        if int(item['relay']) > 0:
            self.relay_url_list.append(item['relay_url'])
        self.r.append(article)
        self.name_url_list.append(a)

    # Parse the pager text ("x/y") to get the current page and the total page count.
    num_page = response.xpath("//div[@id='pagelist']/form/div/text()").extract()
    num_page = [i.replace(u"\xa0", "") for i in num_page]
    num_page = [i for i in num_page if len(i) > 0][0]
    num_page = re.findall(r'\d+', num_page)
    print('Crawling page', num_page[0], 'of', num_page[1])
    max_page = NUM_PAGE
    if max_page is None:
        max_page = int(num_page[1])
    if int(num_page[0]) == max_page:
        L = []
        for L1 in self.name_url_list:
            L += L1
        for url_1 in L:
            with open(os_file.a + '\\crawler_url.txt', 'a', encoding='utf-8') as f:
                f.write(url_1 + "\n")
        print('Page limit reached, search-page crawling finished')
        print('Crawling finished, starting popularity analysis')
        SARunner().article_List(self.r)
        print('Number of weibo posts crawled:', len(self.r))
        # Crawl author avatar, id, following and follower counts.
        with open(os_file.a + '\\crawler_url.txt', 'r', encoding='utf-8') as f:
            urls = f.readlines()  # user URLs to crawl
        # Deduplicate the user URLs before requesting the user detail pages.
        L2 = {}.fromkeys(urls).keys()
        self.L2 = len(L2)
        print('Crawling user detail pages,', self.L2, 'unique users in total')
        for url in L2:
            yield scrapy.FormRequest(url=url,
                                     callback=self.parse_info_detail,
                                     dont_filter=True)
    else:
        next_url = response.xpath("//a[text() = '下页']/@href").extract_first()
        next_url = urllib.parse.urljoin(response.url, next_url)
        yield scrapy.Request(next_url, callback=self.parse_info, dont_filter=True)
class XlSpider(scrapy.Spider):
    name = 'xl'
    allowed_domains = ['sina.com']
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.XinLangPipeline': 300,
        },
    }
    r = []
    R = []
    MAX_PAGE = XL_MAX_PAGE
    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    if '|' in keyword_list:
        keyword_list = keyword_list.replace(u"|", "~")
    # Urlchuli/url_bm appear to URL-encode the keyword as GBK for the search URL.
    a = Urlchuli(keyword_list, 'gbk')
    one = a.url_bm()
    # time parameter: w = one week, m = one month, h = one hour, d = one day
    start_urls = [
        'http://search.sina.com.cn/?c=news&q={}&range=all&time=w&stime=&etime=&num=10'
        .format(one)
    ]

    def parse(self, response):
        a = response.url
        if self.MAX_PAGE is None:
            # Fall back to the page number shown in the pager when no limit is configured.
            MAX_PAGE = response.xpath(
                "//span[@class ='pagebox_cur_page']/text()").extract_first()
        else:
            MAX_PAGE = self.MAX_PAGE
        if 'https://s.weibo.com/weibo/' in str(a):
            print('Search failed, please search again')
            raise Exception('Search failed, please search again')
        else:
            div_list = response.xpath("//div[@class='box-result clearfix']")
            for div in div_list:
                data = div.xpath(".//p[@class = 'content']").xpath(
                    'string(.)').extract()
                title = div.xpath(".//h2/a/text()").extract()
                title = ''.join(title)
                href = div.xpath(".//h2/a/@href").extract_first()
                time = div.xpath(
                    ".//span[@class = 'fgray_time']/text()").extract_first()
                time = re.split(r' ', time)
                time = time[-2] + ' ' + time[-1]
                self.r.append(href)
                yield scrapy.Request(url=href,
                                     meta={
                                         "intro": data,
                                         'href': href,
                                         'time': time,
                                         'title': title
                                     },
                                     callback=self.parse_main,
                                     dont_filter=True)
            next_url = response.xpath(
                "//a[@title = '下一页']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            page = response.xpath(
                "//span[@class = 'pagebox_cur_page']/text()").extract_first()
            if int(page) == int(MAX_PAGE):
                print('Page limit reached')
            else:
                yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)

    def parse_main(self, response):
        item = XinLangspider1Item()
        item['intro'] = str(response.meta["intro"]).replace(
            u"...", "").replace(u"']", "").replace(u"['", "")
        item['href'] = response.meta["href"]
        item['time'] = response.meta['time']
        item['title_main'] = response.meta['title']
        item['article'] = response.xpath(
            "//div[@id = 'artibody']//p//text()|//div[@id = 'article']//p//text()"
        ).extract()
        item['source'] = response.xpath(
            "//a[@class = 'source ent-source']/text()|//span[@class = 'source ent-source']/text()"
        ).extract()
        item['TID'] = None
        # Derive the sina sub-site identifier from the article URL.
        a = re.findall(r'http.{1,}sina', item['href'])[0][7:-5]
        a = a.replace(u"/", "")
        if a in 'k':
            item['TID'] = re.findall(r'article_.{1,}_', item['href'])[0][8:-1]
        else:
            item['TID'] = re.findall(r'-ih.{1,}shtml', item['href'])[0][1:-6]
        if a in xw_type.cs:
            item['source'] = response.xpath(
                "//span[@id = 'art_source']/text()").extract()
            item['article'] = response.xpath(
                "//div[@class = 'article-body main-body']//p//text()").extract()
        elif a in xw_type.ss:
            item['source'] = response.xpath(
                "//a[@class = 'source content-color']/text()|//span[@class ='source content-color']/text()"
            ).extract()
        elif a in xw_type.xw:
            item['article'] = response.xpath("//div[@id = 'article']").xpath(
                'string(.)').extract()
            item['source'] = response.xpath(
                "//a[@class = 'source']/text()").extract()
        elif a in xw_type.bk:
            item['source'] = '新浪博客'
            item['article'] = response.xpath(
                "//div[@id='sina_keyword_ad_area2']/div/font|//div[@id='sina_keyword_ad_area2']/p/font"
            ).xpath('string(.)').extract()
        # Fall back to the mobile version of the site.
        if len(item['article']) == 0 and len(item['source']) == 0:
            item['article'] = response.xpath(
                "//section[@class = 'art_pic_card art_content']/p//text()"
            ).extract()
            item['source'] = response.xpath(
                "//h2[@class ='weibo_user']/text()").extract()
        yield item
        article = Article(tid=item['TID'],
                          channel_id=3,
                          title=item['title_main'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])
        self.R.append(article)
        if len(self.r) == len(self.R):
            print(len(self.R))
            print('Saving to database')
            print('Crawling finished, starting popularity analysis')
            SARunner().article_List(self.R)
class RMWSpider(scrapy.Spider):
    name = 'rmw'
    allowed_domains = ['people.com', 'people.com.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.RenMingPipeline': 300,
        },
    }
    r = []
    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    keyword_list = re.split(r'\|', keyword_list)
    R = []
    R2 = 0
    R1 = len(keyword_list)
    headers = {
        'Location': 'news/getNewsResult.jsp',
        'Server': 'Apache-Coyote/1.1',
    }

    def start_requests(self):
        print('Crawling keywords', self.keyword_list)
        for keyword in self.keyword_list:
            # The search endpoint expects GBK-encoded keywords.
            keyword = keyword.encode('gbk')
            print('Searching...')
            url = 'http://search.people.com.cn/cnpeople/search.do'
            formdata = {
                'siteName': 'news',
                'pageNum': '1',
                'facetFlag': 'true',
                'nodeType': 'belongsId',
                'nodeId': '0',
                'keyword': keyword,
            }
            yield scrapy.FormRequest(url=url,
                                     formdata=formdata,
                                     headers=self.headers,
                                     callback=self.parse_seek,
                                     dont_filter=True)

    def parse_seek(self, response):
        if response.url == 'http://search.people.com.cn/cnpeople/news/error.jsp':
            print('Search failed')
        else:
            print(response.url)
            ul_list = response.xpath("//div[@class='fr w800']/ul")
            for ul in ul_list:
                item = {}
                item['title'] = ul.xpath("./li[1]//a").xpath('string(.)').extract()
                item['time'] = ul.xpath("./li[3]/text()").extract_first()
                item['intro'] = ul.xpath("./li[2]").xpath('string(.)').extract()
                item['href'] = ul.xpath("./li[1]//a/@href").extract_first()
                self.R.append(item['href'])
                yield scrapy.Request(item['href'],
                                     callback=self.parse_main,
                                     meta={
                                         'title': item['title'],
                                         'time': item['time'],
                                         'intro': item['intro'],
                                         'href': item['href']
                                     },
                                     dont_filter=True)
            next_url = response.xpath(
                "//a[text() = '下一页']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            num_page = response.xpath(
                "//div[@class = 'show_nav_bar']/text()").extract()
            try:
                num_page = ''.join(num_page)
                num_page = re.findall(r"\d+", num_page)[0]
            except IndexError:
                pass
            self.R2 += 1
            if RMW_MAX_PAGE is not None:
                if int(num_page) == RMW_MAX_PAGE:
                    if self.R1 == self.R2:
                        print('Page limit reached')
                else:
                    yield scrapy.Request(next_url,
                                         callback=self.parse_seek,
                                         dont_filter=True)
            else:
                yield scrapy.Request(next_url,
                                     callback=self.parse_seek,
                                     dont_filter=True)

    def parse_main(self, response):
        item = RMWspider1Item()
        item['title'] = response.meta['title'][0]
        item['time'] = response.meta['time']
        item['intro'] = response.meta['intro'][0].replace('[', '', 1).replace(']', '')
        item['href'] = response.meta['href']
        item['TID'] = re.findall(r'/c.{1,}html', item['href'])[0][1:-5]
        if 'people' in item['TID']:
            item['TID'] = re.findall(r'/c.{1,}', item['TID'])[0][1:]
        item['source'] = response.xpath(
            "//div[@class = 'artOri']/a/text()|"
            "//div[@class='box01']//a/text()|"
            "//div[@class='text_c']/p//a/text()|"
            "//div[@class = 'msgBox']//a/text()|"
            "//div[@class = 'page_c']/div[@class = 'fr']/a/text()|"
            "//div[@class = 'w1000 p2']//a/text()|"
            "//div[@class = 'p2j_text fl']/h2/a/text()").extract_first()
        item['article'] = response.xpath(
            "//div[@id='rwb_zw']//p|"
            "//div[@class='show_text']//p|"
            "//div[@class='artDet']//p|"
            "//div[@class='text_con clearfix']//p|"
            "//div[@class = 'content clear clearfix']//p|"
            "//div[@id = 'p_content']//p|"
            "//div[@class = 'box_con']//p|"
            "//div[@class = 'text_show']//p|"
            "//div[@class = 'gray box_text']//p|"
            "//div[@class = 'text_box clearfix']//p").xpath('string(.)').extract()
        yield item
        article = Article(tid=item['TID'],
                          channel_id=5,
                          title=item['title'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])
        self.r.append(article)
        if len(self.R) == len(self.r):
            print(len(self.r))
            print('Crawling finished, starting popularity analysis')
            SARunner().article_List(self.r)
class TbSpider(scrapy.Spider):
    name = 'tb'
    allowed_domains = ['tieba.baidu.com/mo/q']
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.BaiDuTBPipeline': 300,
        },
    }
    r = []
    R = []
    MAX_PAGE = TB_MAX_PAGE
    # start_urls = ['http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw=吧名&pn=0']
    """
    url = 'http://wap.baidu.com/sf/vsearch?pd=tieba&word=%E4%B8%AD%E5%B1%B1%E5%A4%A7%E5%AD%A6&tn=vsearch&sa=vs_tab&lid=8756617510026267405&ms=1'
    The spider could be rewritten on top of this URL.
    """
    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    keyword_list = re.split(r'\|', keyword_list)
    p = 0
    P = len(keyword_list)

    def start_requests(self):
        for keyword in self.keyword_list:
            url = "http://tieba.baidu.com/f/search/res?ie=utf-8&qw={}".format(keyword)
            yield scrapy.FormRequest(url=url,
                                     callback=self.parse_detail,
                                     dont_filter=True)

    def parse_detail(self, response):
        print(response.url)
        div_list = response.xpath(
            "//div[@class = 's_post_list']/div[@class = 's_post']")
        for div in div_list:
            item = BaidutiebaItem()
            item['title'] = div.xpath(
                "./span[@class='p_title']/a[@class='bluelink' and @data-fid]"
            ).xpath('string(.)').extract()
            item['time'] = div.xpath(
                ".//font[@class='p_green p_date']/text()").extract_first()
            item['intro'] = div.xpath(".//div[@class = 'p_content']").xpath(
                'string(.)').extract()
            item['href'] = div.xpath(
                "./span[@class='p_title']/a[@class='bluelink' and @data-fid]/@href"
            ).extract_first()
            item['href'] = urllib.parse.urljoin(response.url, item['href'])
            item['source'] = div.xpath("./text()|.//a//font//text()").extract()
            item['source'] = ''.join(item['source'])
            if item['time'] is None:
                # Filter out bare tieba (forum) entries that are not posts.
                continue
            self.r.append(item['href'])
            yield scrapy.Request(item['href'],
                                 callback=self.parse_main,
                                 meta={'item': item},
                                 dont_filter=True)
        self.p += 1
        num_page = response.xpath("//span[@class='cur']/text()").extract_first()
        max_page_url = response.xpath("//a[text() = '尾页']/@href").extract_first()
        if self.MAX_PAGE is None and max_page_url is not None:
            self.MAX_PAGE = re.findall(r'&pn=.{1,}', max_page_url)[0][4:]
        if int(num_page) == int(self.MAX_PAGE):
            if self.p == self.P:
                print('Page limit reached')
        else:
            next_url = response.xpath("//a[text() = '下一页>']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(next_url,
                                 callback=self.parse_detail,
                                 dont_filter=True)

    def parse_main(self, response):
        item = response.meta['item']
        item['reply'] = response.xpath(
            "//div[@id='thread_theme_5']//span[@class='red'][1]/text()").extract()
        yield item
        self.R.append(item)
        if len(self.r) == len(self.R):
            print('Saving to database')
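# A minimal sketch (an assumption, not part of the original project) of the rewrite hinted
# at in the TbSpider docstring above: querying Tieba through the wap.baidu.com vsearch
# endpoint instead of tieba.baidu.com/f/search/res. Only the URL construction is shown;
# the query parameters are copied from the sample URL, and the vsearch result page would
# need its own XPaths, so reusing parse_detail as-is is hypothetical. Relies on the
# module-level urllib.parse import already used by the spiders above.
def build_tb_vsearch_urls(keyword_list):
    # Build one vsearch request URL per keyword.
    return [
        'http://wap.baidu.com/sf/vsearch?pd=tieba&word={}&tn=vsearch&sa=vs_tab'.format(
            urllib.parse.quote(keyword)) for keyword in keyword_list
    ]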
class XHWSpider(scrapy.Spider):
    name = 'xhw'
    allowed_domains = ['so.news.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.RenMingPipeline': 300,
        },
    }
    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    keyword_list = re.split(r'\|', keyword_list)
    p = len(keyword_list)
    page = 1
    R = []
    r = []

    def start_requests(self):
        print('Searching...')
        keyWordAll = self.keyword_list[0]
        if self.p > 1:
            keyWordOne = self.keyword_list[1:]
            keyWordOne = '+'.join(keyWordOne)
            url = 'http://so.news.cn/getNews?keyWordAll={}&keyWordOne={}&keyWordIg=&searchFields=0&sortField=0&url=&senSearch=1&lang=cn&keyword={}&curPage=1'.format(
                keyWordAll, keyWordOne, keyWordAll)
            print(url)
        else:
            url = 'http://so.news.cn/getNews?keyword={}&curPage=1&sortField=0&searchFields=1&lang=cn'.format(
                keyWordAll)
        yield scrapy.Request(url=url, callback=self.parse_seek, dont_filter=True)

    def parse_seek(self, response):
        html = json.loads(response.text)
        data_list = html['content']['results']
        max_page = html['content']['pageCount']
        for data in data_list:
            item = XHWspider1Item()
            # Strip the highlight markup the search API wraps around matched keywords.
            item['title'] = data['title'].replace(u'<font color=red>', '').replace(
                u'</font>', '').replace(u' ', '').replace(u'"', '').replace(u'\u3000', '')
            item['time'] = data['pubtime']
            item['href'] = data['url']
            item['intro'] = data['des']
            if 'xhwkhdapp' in item['href']:
                # Skip links that only open in the Xinhua client app.
                continue
            if item['intro'] is not None:
                item['intro'] = ''.join(item['intro'])
                item['intro'] = item['intro'].replace(u'<font', '').replace(
                    u'color=red>', '').replace(u'</font>', '')
            item['source'] = data['sitename']
            self.R.append(item['href'])
            yield scrapy.Request(url=item['href'],
                                 callback=self.parse_main,
                                 dont_filter=True,
                                 meta={'item': deepcopy(item)})
        if XHW_MAX_PAGE is not None:
            max_page = XHW_MAX_PAGE
        if self.page == max_page:
            print('Page limit reached')
        else:
            self.page += 1
            a = re.compile(r'&curPage=\d+')
            next_url = a.sub('&curPage={}'.format(self.page), response.url)
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_seek,
                                 dont_filter=True)

    def parse_main(self, response):
        item = response.meta['item']
        item['article'] = response.xpath(
            "//div[@class ='p-right left']//div[@id='p-detail']//p|"
            "//div[@id='content']//p|"
            "//div[@class='content']//p|"
            "//div[@class ='contant clearfix']/div[@class ='xl']//p|"
            "//div[@id ='Content']//p|"
            "//div[@class ='zj_left']/div[@class ='zj_nr']//p|"
            "//td[@class='text_con_16_33']//p|"
            "//div[@class ='content pack']//p|"
            "//div[@class = 'article']//p|"
            "//div[@class ='main-content-box']//p|"
            "//div[@id ='nr_wz']//p").xpath('string(.)').extract()
        item['TID'] = re.findall(r'c_.{1,}htm', item['href'])[0][2:-4]
        yield item
        article = Article(tid=item['TID'],
                          channel_id=11,
                          title=item['title'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])
        self.r.append(article)
        if len(self.r) == len(self.R):
            print(len(self.r))
            print('Crawling finished, starting popularity analysis')
            SARunner().article_List(self.r)