def parse(self, response):
    nodelist = response.xpath('//tbody/tr/th')  # get all posts on this page
    item = BaiduspiderItem()
    isHasContent = False  # whether this page has any suitable info
    NextPageUrl = ''
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        childUrl = node.xpath("./a[2][@class='s xst']/@href").extract_first()
        item["title"] = node.xpath("./a[2][@class='s xst']/text()").extract_first()
        item["UrlId"] = node.xpath("./a[2][@class='s xst']/@href").extract_first()
        if childUrl != None:
            item["info"] = ChildPage.ChildPage(childUrl, '1')
        item["time"] = node.xpath('./a[2]/../../td[@class="by"]/em/span/text()').extract_first()
        if item["time"] == None:
            item["time"] = node.xpath('./a[2]/../../td[@class="by"]/em/span/span/text()').extract_first()
        # handle an empty time
        if item["time"] == None:
            item["time"] = ''
        else:
            item["time"] = item["time"].strip()
            item["time"] = TimeCalculate.time_calculate(item["time"], self.name)
        # # handle an empty info
        # if item["info"] == None:
        #     item["info"] = ''
        # check whether this post is within the time window
        if TimeMarch.time_March(item["time"], self.default_scope_day) == True:
            item["IsLimitedTime"] = 'y'
        else:
            item["IsLimitedTime"] = 'n'
            timecount = timecount + 1
        if NextPageUrl == '':  # record the next-page link
            NextPageUrl = response.xpath('//a[@class="bm_h"]/@rel').extract_first()
        if item["UrlId"] != None:  # skip abnormal posts (sticky posts, etc.)
            yield item  # send the item to the pipeline
    if timecount > self.allowed_timesup or NextPageUrl == None:  # continue crawling or stop
        # stop crawling
        item = BaiduspiderItem()
        item["IsLimitedTime"] = 'n'
        yield item
    else:
        yield scrapy.Request('http://www.huhutong315.com/' + NextPageUrl, callback=self.parse)
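# NOTE: TimeMarch.time_March() and TimeCalculate.time_calculate() are project helpers
# whose source is not included in these snippets. The sketch below is only an assumption
# of how time_March behaves as it is used above: return True when a 'YYYY-MM-DD' date
# string falls within the last `scope_days` days. The real helper may differ.
import datetime

def time_March(date_str, scope_days):
    """Assumed behaviour: is date_str within the last scope_days days?"""
    try:
        post_date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
    except (ValueError, TypeError):
        return False
    return 0 <= (datetime.date.today() - post_date).days <= int(scope_days)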
def parse(self, response):
    nodelist = response.xpath('//tbody/tr/th')  # get all posts on this page
    item = BaiduspiderItem()
    isHasContent = False  # whether this page has any suitable info
    NextPageUrl = ''
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        # first check whether it meets the time limit
        item["time"] = node.xpath('./a[2]/../../td[2]/em//text()').extract_first()
        # handle an empty time
        if item["time"] == None:
            item["time"] = ''
        item["time"] = item["time"].strip()
        item["time"] = TimeCalculate.time_calculate(item["time"], self.name)
        if TimeMarch.time_March(item["time"], self.default_scope_day) == True:
            item["IsLimitedTime"] = 'y'
        else:
            item["IsLimitedTime"] = 'n'
            timecount = timecount + 1
        item["title"] = node.xpath("./a[2][@class='s xst']/text()").extract_first()
        item["UrlId"] = node.xpath("./a[2][@class='s xst']/@href").extract_first()
        if item["IsLimitedTime"] == 'y':  # if within the time limit
            childUrl = node.xpath("./a[2][@class='s xst']/@href").extract_first()
            if childUrl != None:
                # check whether it was already crawled before visiting the child page
                id = item['UrlId'].split('/')[3]  # get the urlid
                num = id.split('-')[1]
                if num not in self.idlist:
                    item["info"] = ChildPage.ChildPage(childUrl, '3')
                else:
                    print("already crawled")
        if NextPageUrl == '':  # record the next-page link
            NextPageUrl = response.xpath('//a[@class="bm_h"]/@rel').extract_first()
        if item["UrlId"] != None:  # skip abnormal posts (sticky posts, etc.)
            yield item  # send the item to the pipeline
    if timecount > self.allowed_timesup or NextPageUrl == None:  # continue crawling or stop
        # stop crawling
        item = BaiduspiderItem()
        item["IsLimitedTime"] = 'n'
        yield item
    else:
        yield scrapy.Request('https://www.wszgw.net/' + NextPageUrl, callback=self.parse)
def parse(self, response):
    nodelist = response.xpath('//tbody/tr')  # get all posts on this page
    item = BaiduspiderItem()
    item = inititem(item)
    isHasContent = False  # whether this page has any suitable info
    NextPageUrl = ''
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        childUrl = node.xpath("./th/a[2][@class='s xst']/@href").extract_first()
        item["title"] = node.xpath("./th/a[2][@class='s xst']/text()").extract_first()
        item["url"] = node.xpath("./th/a[2][@class='s xst']/@href").extract_first()
        item["comment"] = node.xpath("./td[@class='num']/a/text()").extract_first()
        item["read"] = node.xpath("./td[@class='num']/em/text()").extract_first()
        item["latestcomtime"] = node.xpath("./td[4]/em/a/span/@title | ./td[4]/em/a/text()").extract_first()
        if childUrl != None:
            item["info"] = ChildPage.ChildPage(childUrl, '1')
        item["time"] = node.xpath('./th/a[2]/../../td[@class="by"]/em/span/text()').extract_first()
        if item["time"] == None:
            item["time"] = node.xpath('./th/a[2]/../../td[@class="by"]/em/span/span/text()').extract_first()
        # handle an empty time
        if item["time"] == None:
            item["time"] = ''
        else:
            item["time"] = item["time"].strip()
            item["time"] = TimeCalculate.time_calculate(item["time"], self.name)
        # # handle an empty info
        # if item["info"] == None:
        #     item["info"] = ''
        # check whether this post is within the time window
        if TimeMarch.time_March(item["time"], self.default_scope_day) == True:
            item["IsFilter"] = True
        else:
            item["IsFilter"] = False
            timecount = timecount + 1
        if NextPageUrl == '':  # record the next-page link
            NextPageUrl = response.xpath('//a[@class="bm_h"]/@rel').extract_first()
        if item["url"] != None:  # skip abnormal posts (sticky posts, etc.)
            item['urlId'] = item['url'].split('/')[3].split('-')[1]  # get the urlId
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            yield item  # send the item to the pipeline
    if timecount > self.allowed_timesup or NextPageUrl == None:  # continue crawling or stop
        # stop crawling
        item = BaiduspiderItem()
        item["IsFilter"] = False
        yield item
    else:
        yield scrapy.Request('http://www.huhutong315.com/' + NextPageUrl, callback=self.parse)
def parse(self, response):
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    try:
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["title"] = response.xpath("//div[@class='cont']/h2/text()").extract()
        item["title"] = "".join(item["title"])
        item["url"] = response.url
        item["urlId"] = item["url"].split('/')[-1].split('.')[0]
        item["urlId"] = '%s_%s' % (self.name, item["urlId"])
        item["time"] = response.xpath("//table[@class='s1']/tbody/tr[2]/td[2]/text()").extract_first()
        item["info"] = response.xpath("//p[@class='MsoNormal']/text()").extract()
        item["info"] = "".join(item["info"])
        # check whether this post is within the time window
        if TimeMarch.time_March(item["time"], self.default_scope_day):
            item["IsFilter"] = True
        else:
            item["IsFilter"] = False
            timecount = timecount + 1
    except:
        item['IsFilter'] = False
    yield item
    self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//div[@class = 'search-news-mod']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./h1/a/text()").extract_first()
            item["url"] = node.xpath("./h1/a/@href").extract_first()
            item["urlId"] = item["url"].split('.')[0].split('/')[-1]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./p[2]/text()").extract_first()
            item["info"] = node.xpath("./p[1]/text()").extract_first()
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
        except:
            item['IsFilter'] = False
        yield item
    self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def child_page(self, response):
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    try:
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["title"] = response.xpath("//tr/td/font/text()").extract_first()
        item["url"] = response.url
        item["urlId"] = item["url"].split('id=')[1]
        item["urlId"] = '%s_%s' % (self.name, item["urlId"])
        item['time'] = response.xpath("//table[@class='dx']/tr/td/text()").extract_first()
        item['time'] = '%s日' % item['time'].split('发布时间:')[1].split('日')[0]
        item['info'] = response.xpath("//div/span/text() | //div/p/span/text()").extract()
        item["info"] = ("".join(item["info"])).replace('\xa0', '').replace('\r\n', '')
        try:
            # check whether this post is within the time window
            item['time'] = time.strftime("%Y-%m-%d", time.strptime(item['time'], "%Y年%m月%d日"))
            if TimeMarch.time_March(item['time'], self.default_scope_day):
                item['IsFilter'] = True
        except:
            item['IsFilter'] = False
    except:
        item['IsFilter'] = False
    yield item
def parse(self, response):
    print(response)
    nodelist = response.xpath("//td[@class = 'td_left30_right30']/table/tr/td")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./span[@class = 'blue14bold']/a/text()").extract_first()
            item["url"] = node.xpath("./span[@class = 'blue14bold']/a/@href").extract_first()
            item["urlId"] = item["url"].split('_')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./span[@class = 'black12bold']/text()").extract_first()
            item["time"] = item["time"].split(' ')[1].split('\n')[0]  # the date ends at the newline
            item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y.%m.%d"))
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            # page layouts are not uniform
            item["info"] = res_child.xpath("//div[@class='cas_content']/p/text() | //div[@class='Custom_UnionStyle']/p/text() | //div[@class='Custom_UnionStyle']/span/p/text() |//div[@class='TRS_Editor']/div/div/p/text() | //div[@class='Custom_UnionStyle']/div/span/text()")
            item["info"] = "".join(item["info"])
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('searchword=')[1].split('&')[0]
        page_num = response.url.split('page=')[-1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://was.jl.gov.cn/was5/web/search?presearchword=&searchword1=&channelid=193132&StringEncoding=UTF-8&searchword=%s&page=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
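# NOTE: the module-level child_page(url) helper used by this and the following spiders is
# not shown in these snippets (it is distinct from the child_page() callback earlier). A
# minimal sketch, assuming it fetches the detail page and returns an lxml HTML tree so that
# res_child.xpath('.../text()') returns plain strings that "".join() can concatenate.
# The timeout and encoding handling below are illustrative assumptions, not project settings.
import requests
from lxml import etree

def child_page(url):
    """Fetch a child (detail) page and return an lxml element for XPath queries."""
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding  # many of these sites serve GBK/GB2312
    return etree.HTML(resp.text)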
def parse(self, response):
    nodelist = response.xpath("//div[@style='background:#FFF;padding:5px;width:100%']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./a/text()").extract()
            item["title"] = "".join(item["title"])
            item["url"] = node.xpath("./a/@href").extract_first()
            item["url"] = 'http://gdj.ah.gov.cn/%s' % item["url"]
            item["urlId"] = item["url"].split('id=')[-1]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./p[2]/span[2]/text()").extract_first()
            if item["time"] is not None:
                item["time"] = str(item["time"]).replace(' ', '')
                item["time"] = item["time"][0:10]
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            print(res_child)
            item["info"] = res_child.xpath("//div[@id = 'Zoom']/p/text() | //div[@id = 'Zoom']/p/font/text() | //div[@id = 'Zoom']/text() | //div[@id = 'Zoom']/font/text() | //div[@id = 'Zoom']/p/span/text()")
            item["info"] = "".join(item["info"])
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('keycontent=')[1].split('&')[0]
        page_num = response.url.split('StartPage=')[1]
        page_num = int(page_num) / 15
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) * 15 + 15
        NextPageUrl = "http://gdj.ah.gov.cn/isearch.php?keytype=1&keycontent=%s&StartPage=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//div[@class='jsearch-result-box']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["url"] = node.xpath("./div/div/div[@class='jsearch-result-url']/a/text()").extract_first()
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./div/div/span[@class='jsearch-result-date']/text()").extract_first()
            item["time"] = item["time"].split(' ')[0]
            item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//p/text()")
            item["info"] = "".join(item["info"])
            item["title"] = res_child.xpath("//td[@class='title']/text()")
            item["title"] = "".join(item["title"])
            item["title"] = item["title"].replace(' ', '')
            item["title"] = item["title"].replace('\r', '')
            item["title"] = item["title"].replace('\n', '')
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('q=')[1]
        page_num = response.url.split('p=')[1].split('&')[0]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://www.nrta.gov.cn/jrobot/search.do?webid=1&pg=12&p=%s&tpl=&category=&q=%s" % (str(page_num), keyword)
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    html = response.text
    html = str(html)
    docs = html.split("DOCPUBURL")[1:]
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for doc in docs:
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        url = doc.split(':\"', 1)[1].split('\"', 1)[0]
        doc = doc.split("DOCPUBTIME", 1)[1]
        creattime = doc.split(':\"', 1)[1].split('\"', 1)[0]
        doc = doc.split("DOCID", 1)[1]
        id = doc.split(':', 1)[1].split(',', 1)[0]
        doc = doc.split("DOCTITLE", 1)[1]
        title = doc.split(':\"', 1)[1].split('\"', 1)[0]
        title = title.replace('<em>', '')
        title = title.replace('</em>', '')
        item["title"] = title
        item["url"] = url
        item["urlId"] = '%s_%s' % (self.name, id)
        item["time"] = creattime.replace(' ', '')[0:10]
        print(item["time"])
        # check whether this post is within the time window
        if TimeMarch.time_March(item["time"], self.default_scope_day):
            item["IsFilter"] = True
        else:
            item["IsFilter"] = False
            timecount = timecount + 1
        res_child = child_page(item["url"])
        item["info"] = res_child.xpath("//font[@id='Zoom']/div/span/text() | //font[@id='Zoom']//span/text() | //font[@id='Zoom']//p/text()")
        item["info"] = "".join(item["info"])
        yield item
    if (len(docs) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('keyword=')[1].split('&')[0]
        page_num = response.url.split('pageNumber=')[1].split('&')[0]
        page_num = int(page_num) + 1
        print('\n*********************************** page %s ***********************************\n' % str(page_num))
        NextPageUrl = "http://gbdsj.guizhou.gov.cn/57/front/search.jhtml?code=c10a0a56f987453cb15e6a1fe45f7b8&keyword=" + str(keyword) + "&pageNumber=" + str(page_num) + "&filterParam=typename%3A1%3BsiteName%3A50&timeScope=+&orderBy=time&_=1569230227733"
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//div[@class='wr_body_type1 cont2']//li")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["url"] = node.xpath("./a/@href").extract_first()
            item["urlId"] = item["url"].split('_')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["title"] = node.xpath("./a/text()").extract_first()
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//div[@id = 'z']/p//span/text()")
            item["info"] = "".join(item["info"])
            item["time"] = res_child.xpath("//td[@align='center']/span/text()")
            item["time"] = "".join(item["time"])
            item["time"] = item["time"].split(':')[-1]
            item["time"] = item["time"].replace('年', '-')
            item["time"] = item["time"].replace('月', '-')
            item["time"] = item["time"].replace('日', '')
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('searchword=')[1].split('&')[0]
        page_num = response.url.split('page=')[1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://gdj.nx.gov.cn/was5/web/search?searchword=%s&channelid=244757&page=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse_html(self, response):
    item = BaiduspiderItem()
    html = json.loads(response.text)
    if html:
        results = html['data']
        for result in results:
            try:
                item['img_url'] = result['objURL']
                yield item
            except:
                pass
def parse(self, response):
    nodelist = response.xpath("//div[@align='left']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    pagecount = 0
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["url"] = node.xpath("./a[1]/@href").extract_first()
            item["url"] = 'http://gd.shandong.gov.cn%s' % item["url"]
            item["urlId"] = item["url"].split('articles/')[-1].split('/')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./font[@class='filterTime']/text()").extract_first()
            item["time"] = "".join(item["time"])
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["title"] = res_child.xpath("//h1[@class='title']/text() | //div[@class='editor-content editor-content-nweview']/p/text() | //div[@class='editor-content editor-content-nweview']/p/font/text() | //div[@class='editor-content editor-content-nweview']/p")
            item["info"] = res_child.xpath("//p[@class='MsoNormal']/span/text() | //font/text() | //div[@class='editor-content editor-content-nweview']//p/text()")
            item["info"] = "".join(item["info"])
            pub_time = res_child.xpath("//div[@class='content content-view']/p[1]/span[1]/text()")
            if pub_time is not None:
                item["time"] = pub_time
                item["time"] = "".join(item["time"])
                item["time"] = str(item["time"])[0:10]
            title = item["title"][0]
            item["title"] = title
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('content=')[1].split('&')[0]
        page_num = response.url.split('currentpage=')[-1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://gd.shandong.gov.cn/gentleCMS/cmssearch/search.do?siteId=224c56cd-948a-4ac8-95bf-a44822be2f09&content=%s&currentpage=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        if page_num < 6:
            yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//div[@class='xwd']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["url"] = node.xpath("./div[1]/a/@href").extract_first()
            item["url"] = 'http://gdj.shaanxi.gov.cn/%s' % item["url"]
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./div[2]/text()").extract_first()
            item["time"] = "".join(item["time"])
            item["time"] = item["time"].split("时间:")[-1]
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["title"] = res_child.xpath("//div[@class='article-title']/h1/text()")
            item["title"] = "".join(item["title"])
            item["info"] = res_child.xpath("//div[@class='v_news_content']/p/font/text()")
            item["info"] = "".join(item["info"])
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('keyword=')[1]
        page_num = response.url.split('currentnum=')[1].split('&')[0]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://gdj.shaanxi.gov.cn/chaxunjieguo.jsp?wbtreeid=1001&searchScope=0&currentnum=%s&keyword=%s" % (str(page_num), keyword)
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    html = response.text
    html = str(html)
    docs = html.split("documentId")[1:]
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for doc in docs:
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        id = doc.split(':', 1)[1].split(',', 1)[0]
        doc = doc.split("documentDate", 1)[1]
        creattime = doc.split(':\"', 1)[1].split('\"', 1)[0]
        doc = doc.split("documentTitle", 1)[1]
        title = doc.split(':\"', 1)[1].split('\",', 1)[0]
        doc = doc.split("documentUrl", 1)[1]
        url = doc.split(':\"', 1)[1].split('\"', 1)[0]
        url = url.split("xwcbj2016")[1]
        url = "http://gdj.sc.gov.cn%s" % url
        item["title"] = title
        item["url"] = url
        item["urlId"] = '%s_%s' % (self.name, id)
        item["time"] = creattime.replace(' ', '')
        # check whether this post is within the time window
        if TimeMarch.time_March(item["time"], self.default_scope_day):
            item["IsFilter"] = True
        else:
            item["IsFilter"] = False
            timecount = timecount + 1
        res_child = child_page(item["url"])
        item["info"] = res_child.xpath("//div[@class = 'Custom_UnionStyle']/p/text() | //div[@class = 'Custom_UnionStyle']/p/font/text() | //div[@class='content']//span/text() | //div[@class='content']//font/text() | //div[@class = 'Custom_UnionStyle']//span/text()")
        item["info"] = "".join(item["info"])
        yield item
    if (len(docs) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('keyword=')[1].split('&')[0]
        page_num = response.url.split('pageIndex=')[1]
        page_num = int(page_num) + 1
        print('\n*********************************** page %s ***********************************\n' % str(page_num))
        NextPageUrl = "http://gdj.sc.gov.cn/scxwcbjss/search?keyword=%s&pageIndex=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//li[@class='active']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["url"] = node.xpath("./div[@class='com-title']/a/@href").extract_first()
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./div[2]/div/span[2]/text()").extract_first()
            item["time"] = item["time"].split(':')[-1]
            res_child = child_page(item["url"])
            # item["info"] = res_child.xpath("//div[@id='j-show-body']/div/div/p/span/voice/text()")
            item["info"] = res_child.xpath("//span/text()")
            item["info"] = "".join(item["info"])
            item["title"] = res_child.xpath("//div[@class='main_content']/h2/text()")
            item["title"] = "".join(item["title"])
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('q=')[1].split('&')[0]
        page_num = response.url.split('p=')[1].split('&')[0]
        page_num = int(page_num)
        print('\n*********************************** page %s ***********************************\n' % (page_num + 1))
        page_num = page_num + 1
        NextPageUrl = "http://searchs.hunan.gov.cn/hunan/gbdsj/news?q=%s&searchfields=&sm=0&columnCN=&p=%s&timetype=timeqb" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    html = response.text
    html = str(html)
    docs = html.split("\"title\":\"")[1:-1]
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for doc in docs:
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        title = doc.split("\",")[0]
        doc = doc.split("\",", 1)[1].split('\"url\":\"')[1]
        url = doc.split("\",")[0]
        doc = doc.split("\",", 1)[1].split('\"time\":\"')[1]
        creattime = doc.split(' ')[0]
        item["title"] = title
        item["url"] = url
        item["urlId"] = item["url"].split('_')[-1].split('.')[0]
        item["urlId"] = '%s_%s' % (self.name, item["urlId"])
        item["time"] = creattime
        # check whether this post is within the time window
        if TimeMarch.time_March(item["time"], self.default_scope_day):
            item["IsFilter"] = True
        else:
            item["IsFilter"] = False
            timecount = timecount + 1
        res_child = child_page(item["url"])
        item["info"] = res_child.xpath("//div[@class = 'Custom_UnionStyle']/p/text() | //div[@class = 'Custom_UnionStyle']/p/font/text() | //p/font/text() | //p/text()")
        item["info"] = "".join(item["info"])
        yield item
    if (len(docs) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('classsql=')[1].split('&')[0]
        page_num = response.url.split('&page=')[1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://gdj.fujian.gov.cn/was5/web/search?channelid=229105&templet=advsch.jsp&sortfield=-docreltime&classsql=%s&prepage=20&page=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//span[@class = 'list plist rc']/a")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./text()").extract_first()
            item["url"] = node.xpath("./@href").extract_first()
            item["url"] = 'http://gdj.shanxi.gov.cn/%s' % item["url"]
            item["urlId"] = item["url"].split('id=')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./em/text()").extract_first()
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//div[@id = 'Zoom']/p/text() | //div[@id='Zoom']/text()")
            item["info"] = "".join(item["info"])
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('title=')[1]
        page_num = response.url.split('p=')[1].split('&')[0]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://gdj.shanxi.gov.cn/soso.aspx?p=%s&title=%s&type=1" % (str(page_num), keyword)
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//ul[@class='search_list']/li")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./a/text()").extract_first()
            item["url"] = node.xpath("./a/@href").extract_first()
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./span/text()").extract_first()
            item["time"] = item["time"].replace('[', '')
            item["time"] = item["time"].replace(']', '')
            # only keep posts whose title matches the target keywords, then check the time window
            if "直播卫星" in item["title"] or "中星九号" in item["title"] or "扶贫工程" in item["title"] or "扶贫" in item["title"]:
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                print(res_child.text)
                item["info"] = res_child.xpath("//div[@class='content content_article']/text() | //div[@class='content content_article']/div/text()")
                item["info"] = "".join(item["info"])
                yield item
            else:
                yield None
        except:
            yield None
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        page_num = response.url.split('page=')[1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        if page_num < 30:
            NextPageUrl = "http://gbdsj.gxzf.gov.cn/index.php?m=search&c=index&a=init&page=%s" % (str(page_num))
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath('//tbody/tr/th')  # get all posts on this page
    item = BaiduspiderItem()
    isHasContent = False  # whether this page has any suitable info
    NextPageUrl = ''
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        childUrl = node.xpath("./a[2][@class='s xst']/@href").extract_first()
        item["title"] = node.xpath("./a[2][@class='s xst']/text()").extract_first()
        item["UrlId"] = node.xpath("./a[2][@class='s xst']/@href").extract_first()
        if childUrl != None:
            item["info"] = ChildPage.ChildPage(childUrl, '2')
        item["time"] = node.xpath('./a[2]/../../td[@class="by"]/em/span/text()').extract_first()
        if item["time"] == None:
            item["time"] = node.xpath('./a[2]/../../td[@class="by"]/em/span/span/text()').extract_first()
        # handle an empty time
        if item["time"] == None:
            item["time"] = ''
        else:
            item["time"] = item["time"].strip()
            item["time"] = TimeCalculate.time_calculate(item["time"], self.name)
        # check whether this post is within the time window
        if TimeMarch.time_March(item["time"], self.default_scope_day) == True:
            item["IsLimitedTime"] = 'y'
        else:
            item["IsLimitedTime"] = 'n'
            timecount = timecount + 1
        if NextPageUrl == '':  # record the next-page link
            NextPageUrl = response.xpath('//div[@class="pg"]/a[@class="nxt"]/@href').extract_first()
        if item["UrlId"] != None:  # skip abnormal posts (sticky posts, etc.)
            yield item  # send the item to the pipeline
    if timecount > self.allowed_timesup:  # continue crawling or stop
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
    else:
        yield scrapy.Request(NextPageUrl, callback=self.parse)
def parse(self, response):
    nodelist = response.xpath("//div[@id='items']/div[@class='resultItem']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["title"] = node.xpath("./a/text()").extract()
        item["title"] = "".join(item["title"])
        item["url"] = node.xpath("./a/@href").extract_first()
        item["urlId"] = item["url"].split('/')[-1].split('.')[0]
        item["urlId"] = '%s_%s' % (self.name, item["urlId"])
        item["info"] = node.xpath("./div/text()").extract()
        item["info"] = "".join(item["info"])
        item["time"] = node.xpath("./font/text()").extract_first()
        item["time"] = item["time"].split(' ')[1]
        try:
            item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('q=')[1].split('&')[0]
        page_num = response.url.split('page=')[1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://searchgov1.eastday.com/searchwgj/search.ashx?q=%s&page=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//div[@class='articleList_listBox']/a")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item["title"] = node.xpath("./@title").extract_first()
            item["url"] = node.xpath("./@href").extract_first()
            item["url"] = 'http://www.hnppb.gov.cn%s' % item["url"]
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./span[2]/text()").extract_first()
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//div[@class='article_body']//p/span/text()")
            item["info"] = "".join(item["info"])
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('title=')[1].split('&')[0]
        page_num = response.url.split('pageNo=')[1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://www.hnppb.gov.cn/cms/12/search.do?basic_title=%s&pageNo=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//li[@class='wrap']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./div/h5/a").extract()
            item["title"] = xml_filter("".join(item["title"]))
            item["url"] = node.xpath("./div/h5/a/@href").extract_first()
            item["urlId"] = item["url"].split('=')[-1]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item['info'] = node.xpath("//li[@class='wrap']/div/p").extract()
            item["info"] = xml_filter("".join(item["info"]))
            item["time"] = node.xpath("./div[@class='adds']/text()").extract_first()
            item["time"] = item["time"].split(':')[1]
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('q=')[1].split('&')[0]
        page_num = response.url.split('page=')[1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://gbdsj.nmg.gov.cn/?m=search&c=index&a=init&typeid=1&siteid=1&q=%s&page=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//div[@class='msg discuss']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["title"] = node.xpath("./div[@class='titleP']/a/@title").extract_first()
        item["url"] = node.xpath("./div[@class='titleP']/a/@href").extract_first()
        item["url"] = 'http://so.kaipuyun.cn/%s' % item["url"]
        item["urlId"] = '12'
        item["time"] = node.xpath("./div[@class='content']/span/text()").extract_first()
        item["info"] = node.xpath("./div[@class='content']/p/text()").extract()
        item["info"] = "".join(item["info"])
        try:
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('qt=')[1].split('&')[0]
        page_num = response.url.split('page=')[1]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://so.kaipuyun.cn/s?q=1&qt=%s&siteCode=N000005664&page=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    html = str(response.text)
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    html = html.split('list":[{', 1)[1]
    html = html.split('}],"total"', 1)[0]
    html = str(html).encode('unicode_escape').decode("unicode_escape")
    docs = html.split("},{")
    for doc in docs:
        try:
            doc = '{' + doc + '}'
            doc_dict = ast.literal_eval(doc)
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = str(doc_dict['title'])
            item["title"] = item["title"].replace('<em>', '')
            item["title"] = item["title"].replace('</em>', '')
            item["title"] = item["title"].replace('<\\\\/em>', '')
            item["title"] = item["title"].replace('[', '')
            item["title"] = item["title"].replace(']', '')
            item["url"] = str(doc_dict['url']).replace('\\', '')
            item["urlId"] = item["url"].split('post_')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = doc_dict['pub_time']
            info = str(doc_dict['content'])
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//div[@class='article-content']/p/text() | //div[@class='article-content']//p/text() | //div[@id='content']//span/text()")
            item["info"] = "".join(item["info"])
            if len(item["info"]) < len(info):
                item["info"] = info
            yield item
        except:
            print()
    self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//div[@class = 'listItem']/ul/a")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["url"] = node.xpath("./@href").extract_first()
            item["url"] = "http://gdj.gansu.gov.cn" + item["url"]
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath(".//div[@class = 'date']/text()").extract_first()
            item["time"] = item["time"].split(' ')[0]
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//div[@class = 'notice_content']/p/span/text() | //div[@class = 'notice_content']/section//p/span/text()")
            item["info"] = "".join(item["info"])
            item["title"] = res_child.xpath("//div[@class = 'titles']/h6/text()")
            item["title"] = "".join(item["title"])
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('keyword=')[1]
        page_num = response.url.split('p=')[1].split('&')[0]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://gdj.gansu.gov.cn/home/search/index.html?keyword=%s&p=%s" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//p")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./a[@class='subject']/text()").extract_first()
            item["url"] = node.xpath("./a[@class='subject']/@href").extract_first()
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./a[@class='green']/text()").extract_first()
            item["time"] = "".join(item["time"])
            item["time"] = item["time"].split(' - ')[-1]
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//div[@id = 'zoom']/p/text() | //div[@id = 'zoom']/p/span/text() | //font")
            item["info"] = "".join(item["info"])
        except:
            item['IsFilter'] = False
        yield item
    if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
        keyword = response.url.split('q=')[1].split('&')[0]
        page_num = response.url.split('p=')[1].split('&')[0]
        print('\n*********************************** page %s ***********************************\n' % page_num)
        page_num = int(page_num) + 1
        NextPageUrl = "http://www.zjxwcb.gov.cn/jsearch/search?q=%s&area=1&pos=1&date=1&p=%s&pg=10&x=17&y=13" % (keyword, str(page_num))
        print(NextPageUrl)
        yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
    else:
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath("//dl[@class = 'bbda cl']")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["title"] = node.xpath("./dt/a/text()").extract_first()
        item["url"] = node.xpath("./dt/a/@href").extract_first()
        item["urlId"] = item["url"].split('/')[-1].split('.')[0]
        item["urlId"] = '%s_%s' % (self.name, item["urlId"])
        item["time"] = node.xpath("./dd[2]/span/text()").extract_first()
        item["time"] = item["time"].split(' ', 1)[-1]
        try:
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
        except:
            item['IsFilter'] = False
        res_child = child_page(item["url"])
        item["info"] = res_child.xpath("//td[@id='article_content']/p/span/text() | //td[@id='article_content']/font/text() | //td[@id='article_content']/p/text() | //td[@id='article_content']/div/span/text() | //td[@id='article_content']/span/text()")
        item["info"] = "".join(item["info"])
        item["comment"] = res_child.xpath("//p[@class='xg1']/a[2]/em/text()")
        item["latestcomtime"] = res_child.xpath("//div[@class='bm_c']/dl[1]/dt/span[@class='xg1 xw0']/text()")
        item["latestcomtime"] = "".join(item["latestcomtime"])
        if item["latestcomtime"] == "":
            item["latestcomtime"] = None
        if item["comment"] != []:
            item["comment"] = item["comment"][0]
        else:
            item["comment"] = None
        item["read"] = res_child.xpath("//em[@id='_viewnum']/text()")[0]
        yield item
    self.crawler.engine.close_spider(self, 'Finished')  # close the spider
def parse(self, response):
    nodelist = response.xpath('//div[@class="col2_right j_threadlist_li_right "]')
    item = BaiduspiderItem()
    NextPageUrl = ''
    for node in nodelist:
        item["title"] = node.xpath("./div[1]/div/a[@title]/text()").extract_first()
        item["UrlId"] = node.xpath("./div[1]/div/a[@href]/@href").extract_first()
        item["info"] = node.xpath('./div[2]/div[@class="threadlist_text pull_left"]/div[1]/text()').extract_first()
        item["time"] = node.xpath('./div[1]/div[2]/span[@title="创建时间"]/text()').extract_first()
        childUrl = "https://tieba.baidu.com" + item["UrlId"]
        item["UrlId"] = childUrl
        if NextPageUrl == '':
            NextPageUrl = 'https:' + response.xpath('//a[@class = "next pagination-item "]/@href').extract_first()
        # method that reads the post detail page; the requirement does not need it, and
        # 'baidu2' alone is enough in practice. Enable childPage in items if you use this.
        request = scrapy.Request(childUrl, callback=self.ChildPage)
        request.meta['item'] = item
        yield request
    yield scrapy.Request('https://tieba.baidu.com/f?kw=%E6%88%B7%E6%88%B7%E9%80%9A&ie=utf-8&pn=50', callback=self.parse)
    print("Moved to the next page!")
def parse(self, response):
    nodelist = response.xpath("//td[@class='ta']/table/tr")  # get all posts on this page
    nodelist = [] if nodelist == None else nodelist
    item = BaiduspiderItem()
    item = inititem(item)
    # whether it meets the crawl conditions
    item['IsFilter'] = False
    timecount = 0  # counter
    for node in nodelist:  # parse each post
        try:
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./td[2]/a/text()").extract_first()
            item["url"] = node.xpath("./td[2]/a/@href").extract_first()
            item["url"] = 'http://gdj.qinghai.gov.cn/%s' % item["url"]
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s' % (self.name, item["urlId"])
            item["time"] = node.xpath("./td[3]/text()").extract_first()
            item["time"] = "".join(item["time"])
            if item["time"] is not None:
                item["time"] = str(item["time"]).replace(' ', '')[0:10]
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsFilter"] = True
            else:
                item["IsFilter"] = False
                timecount = timecount + 1
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//p/span/text() | //p/font/span/text()")
            item["info"] = "".join(item["info"])
        except:
            item['IsFilter'] = False
        yield item
    self.crawler.engine.close_spider(self, 'Finished')  # close the spider