def parse_detail(self, response):
    """Build a news item from an XMU news detail page.

    Collects the text and the image URLs of every ``<p>`` in the article
    body, fills a ``NewsspiderItem`` and then decides what to do with it:

    * release date older than ``self.latest_release_time``: close the spider;
    * link already present in table ``web_news``: yield nothing;
    * otherwise: yield the item.

    Args:
        response: The downloaded detail page.

    Yields:
        The populated ``NewsspiderItem`` when the article is not stored yet.

    Raises:
        ValueError: If a date does not match ``%Y-%m-%d``.
    """
    paragraphs = []
    image_urls = []
    for p in response.xpath('//*[@id="newscontent"]/div/div[1]/div/div/p'):
        # A paragraph can be split over several text nodes (nested spans,
        # links, ...); join them all instead of keeping only the first one.
        text = "".join(p.xpath('.//text()').extract())
        if text:
            paragraphs.append(text + "\r\n")
        # Keep every image of the paragraph; urljoin copes with relative,
        # root-relative and absolute ``src`` values alike.
        for src in p.xpath('./img/@src').extract():
            image_urls.append(response.urljoin(src) + ";")

    title = response.xpath(
        '//*[@id="mainContainer"]/div[3]/table/tr/td/span/span/span/text()'
    ).extract_first()
    release_time = response.xpath(
        '//*[@id="mainContainer"]/div[4]/table/tr[2]/td/span[1]/span/span/text()'
    ).extract_first()
    if title is None or release_time is None:
        # The page does not have the expected layout: skip it explicitly
        # rather than failing with an IndexError.
        self.logger.warning(
            "Skipping %s: title or release time not found", response.url)
        return
    # strptime rejects surrounding whitespace, so normalise the date first.
    release_time = release_time.strip()

    item = NewsspiderItem()
    item["news_title"] = title
    item["news_content"] = "".join(paragraphs)
    item["news_source"] = '厦门大学新闻网'
    item["news_link"] = response.url
    item["news_release_time"] = release_time
    item["news_read_status"] = '1'
    item["news_get_time"] = time.strftime('%Y-%m-%d')
    item["news_imgs"] = "".join(image_urls)

    print("the latest_release_time from web_xmu_news is {0}".format(
        self.latest_release_time))
    published = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
    latest = time.mktime(time.strptime(self.latest_release_time, "%Y-%m-%d"))
    result = int(published) - int(latest)
    print("发布时间:{0} 是否继续:{1}".format(release_time, result))

    # Stop crawling once an article is strictly older than the newest stored
    # one. Articles from the same day fall through to the duplicate check,
    # so new same-day articles are still saved.
    if result < 0:
        self.crawler.engine.close_spider(self, '厦门大学新闻网消息爬取完成!')
        return
    # Do not store an article whose link is already in the database.
    if myMysql.myMysql().columnExist(tableName="web_news",
                                     columnValue=item['news_link']):
        return
    yield item
def news_detail(self, response):
    """Build a news item from a Student Affairs Office (xsc) detail page.

    Collects the text and the image URLs of every ``<p>`` directly inside
    the ``Article_Content`` div, fills an ``xscNewsspiderItem`` and then:

    * release date older than ``self.latest_release_time``: close the spider;
    * link already present in table ``web_xsc_news``: yield nothing;
    * otherwise: yield the item.

    Args:
        response: The downloaded detail page.

    Yields:
        The populated ``xscNewsspiderItem`` when the article is not stored yet.

    Raises:
        ValueError: If a date does not match ``%Y-%m-%d``.
    """
    paragraphs = []
    image_urls = []
    for p in response.xpath('//div[@class="Article_Content"]/p'):
        # Join every text node of the paragraph; keeping only the first one
        # truncates paragraphs that contain nested markup.
        text = "".join(p.xpath(".//text()").extract())
        if text:
            paragraphs.append(text + "\r\n")
        # Keep every image; urljoin copes with relative and absolute srcs.
        for src in p.xpath("./img/@src").extract():
            image_urls.append(response.urljoin(src) + ";")

    title = response.xpath(
        '//span[@class="Article_Title"]/text()').extract_first()
    release_time = response.xpath(
        '//span[@class="Article_PublishDate"]/text()').extract_first()
    if title is None or release_time is None:
        # Unexpected page layout: skip explicitly instead of an IndexError.
        self.logger.warning(
            "Skipping %s: title or release time not found", response.url)
        return
    # strptime rejects surrounding whitespace, so normalise the date first.
    release_time = release_time.strip()

    item = xscNewsspiderItem()
    item["news_title"] = title
    item["news_content"] = "".join(paragraphs)
    item["news_source"] = '厦门大学学生处'
    item["news_link"] = response.url
    item["news_release_time"] = release_time
    item["news_read_status"] = '1'
    item["news_get_time"] = time.strftime('%Y-%m-%d')
    item["news_imgs"] = "".join(image_urls)

    print("the latest_release_time from web_xsc_news is {0}".format(
        self.latest_release_time))
    published = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
    latest = time.mktime(time.strptime(self.latest_release_time, "%Y-%m-%d"))
    result = int(published) - int(latest)
    print("发布时间:{0} 是否继续:{1}".format(release_time, result))

    # Stop crawling once an article is older than the newest stored one.
    if result < 0:
        self.crawler.engine.close_spider(self, "学生处消息爬取完成!")
        return
    # Do not store an article whose link is already in the database.
    if myMysql.myMysql().columnExist(tableName="web_xsc_news",
                                     columnValue=item['news_link']):
        return
    yield item
def news_detail(self, response):
    """Build a news item from an Academic Affairs Office (jwc) detail page.

    Collects the text and the image URLs of every article ``<p>``, fills a
    ``jwcNewsspiderItem`` and then:

    * release date older than ``self.latest_release_time``: close the spider;
    * link already present in table ``web_jwc_news``: yield nothing;
    * otherwise: yield the item.

    Args:
        response: The downloaded detail page.

    Yields:
        The populated ``jwcNewsspiderItem`` when the article is not stored yet.

    Raises:
        ValueError: If a date does not match ``%Y-%m-%d``.
    """
    paragraphs = []
    image_urls = []
    for p in response.xpath("/html/body/div[3]/div[2]/div/div/div/div/p"):
        # Join every text node of the paragraph; keeping only the first one
        # truncates paragraphs that contain nested markup.
        text = "".join(p.xpath(".//text()").extract())
        if text:
            paragraphs.append(text + "\r\n")
        # Keep every image; urljoin copes with relative and absolute srcs.
        for src in p.xpath("./img/@src").extract():
            image_urls.append(response.urljoin(src) + ";")

    title = response.xpath(
        "/html/body/div[3]/div[1]/h1/span/span/span/text()").extract_first()
    release_time = response.xpath(
        "/html/body/div[3]/div[1]/h2/span/span/span/text()").extract_first()
    if title is None or release_time is None:
        # Unexpected page layout: skip explicitly instead of an IndexError.
        self.logger.warning(
            "Skipping %s: title or release time not found", response.url)
        return
    # strptime rejects surrounding whitespace, so normalise the date first.
    release_time = release_time.strip()

    item = jwcNewsspiderItem()
    item["news_title"] = title
    item["news_content"] = "".join(paragraphs)
    item["news_source"] = '厦门大学教务处'
    item["news_link"] = response.url
    item["news_release_time"] = release_time
    item["news_read_status"] = '1'
    item["news_get_time"] = time.strftime('%Y-%m-%d')
    item["news_imgs"] = "".join(image_urls)

    print("the latest_release_time from web_jwc_news is {0}".format(
        self.latest_release_time))
    published = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
    latest = time.mktime(time.strptime(self.latest_release_time, "%Y-%m-%d"))
    result = int(published) - int(latest)
    print("发布时间:{0} 是否继续:{1}".format(release_time, result))

    # Stop crawling once an article is older than the newest stored one.
    if result < 0:
        self.crawler.engine.close_spider(self, "教务处消息爬取完成!")
        return
    # Do not store an article whose link is already in the database.
    if myMysql.myMysql().columnExist(tableName="web_jwc_news",
                                     columnValue=item['news_link']):
        return
    yield item
class GetnewsSpider(scrapy.Spider):
    """Spider for the Xiamen University news site (news.xmu.edu.cn).

    ``parse`` walks the paginated list pages and schedules one request per
    article; ``parse_detail`` turns an article page into a
    ``NewsspiderItem``.
    """

    # Spider name.
    name = 'xmu_news'
    # Domains the spider is allowed to crawl.
    allowed_domains = ['news.xmu.edu.cn']
    start_urls = ['http://news.xmu.edu.cn/1552/list.htm']
    # Class-level attribute: the query runs once, when the class body is
    # executed, and the value is shared by all instances.
    latest_release_time = myMysql.myMysql().getLatestTime(tableName="web_news")

    def parse(self, response):
        """Schedule a request per listed article, then follow the next page.

        Args:
            response: A downloaded list page.

        Yields:
            ``scrapy.Request`` objects for the detail pages and, when a
            usable "next" link exists, for the following list page.
        """
        rows = response.xpath(
            "/html/body/div/div[4]/div[3]/table/tr/td/div/div/div[1]/table/tr"
        )
        for each in rows:
            href = each.xpath("./td/table/tr/td[1]/a/@href").extract_first()
            if not href:
                # A row without a link must not abort the whole page (an
                # IndexError here would also lose the next-page request).
                continue
            detail_url = response.urljoin(href)
            if not detail_url.startswith(("http://", "https://")):
                continue
            print("详情地址:" + detail_url)
            yield scrapy.Request(url=detail_url, callback=self.parse_detail)

        # Pagination: follow the "next" link when there is a real one.
        next_href = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_href:
            next_page = response.urljoin(next_href)
            # Ignore non-HTTP targets such as "javascript:" placeholders.
            if next_page.startswith(("http://", "https://")):
                print("下一页地址:" + next_page)
                yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Build a news item from an article page.

        The item is yielded unless the article is older than
        ``latest_release_time`` (the spider is closed) or its link already
        exists in table ``web_news`` (nothing is yielded).

        Args:
            response: The downloaded detail page.

        Yields:
            The populated ``NewsspiderItem`` when the article is not stored
            yet.

        Raises:
            ValueError: If a date does not match ``%Y-%m-%d``.
        """
        paragraphs = []
        image_urls = []
        for p in response.xpath('//*[@id="newscontent"]/div/div[1]/div/div/p'):
            # Join every text node of the paragraph; keeping only the first
            # one truncates paragraphs that contain nested markup.
            text = "".join(p.xpath('.//text()').extract())
            if text:
                paragraphs.append(text + "\r\n")
            # Keep every image; urljoin copes with relative and absolute srcs.
            for src in p.xpath('./img/@src').extract():
                image_urls.append(response.urljoin(src) + ";")

        title = response.xpath(
            '//*[@id="mainContainer"]/div[3]/table/tr/td/span/span/span/text()'
        ).extract_first()
        release_time = response.xpath(
            '//*[@id="mainContainer"]/div[4]/table/tr[2]/td/span[1]/span/span/text()'
        ).extract_first()
        if title is None or release_time is None:
            # Unexpected page layout: skip explicitly instead of IndexError.
            self.logger.warning(
                "Skipping %s: title or release time not found", response.url)
            return
        # strptime rejects surrounding whitespace, so normalise the date.
        release_time = release_time.strip()

        item = NewsspiderItem()
        item["news_title"] = title
        item["news_content"] = "".join(paragraphs)
        item["news_source"] = '厦门大学新闻网'
        item["news_link"] = response.url
        item["news_release_time"] = release_time
        item["news_read_status"] = '1'
        item["news_get_time"] = time.strftime('%Y-%m-%d')
        item["news_imgs"] = "".join(image_urls)

        print("the latest_release_time from web_xmu_news is {0}".format(
            self.latest_release_time))
        published = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
        latest = time.mktime(
            time.strptime(self.latest_release_time, "%Y-%m-%d"))
        result = int(published) - int(latest)
        print("发布时间:{0} 是否继续:{1}".format(release_time, result))

        # Stop crawling once an article is strictly older than the newest
        # stored one. Same-day articles fall through to the duplicate check,
        # so new same-day articles are still saved.
        if result < 0:
            self.crawler.engine.close_spider(self, '厦门大学新闻网消息爬取完成!')
            return
        # Do not store an article whose link is already in the database.
        if myMysql.myMysql().columnExist(tableName="web_news",
                                         columnValue=item['news_link']):
            return
        yield item
class JwcXmuSpider(scrapy.Spider):
    """Spider for the XMU Academic Affairs Office site (jwc.xmu.edu.cn).

    ``parse`` walks the paginated list pages and schedules one request per
    article; ``news_detail`` turns an article page into a
    ``jwcNewsspiderItem``.
    """

    name = 'jwc_xmu_news'
    allowed_domains = ['jwc.xmu.edu.cn']
    start_urls = ['http://jwc.xmu.edu.cn/2194/list.htm']
    # Class-level attribute: the query runs once, when the class body is
    # executed, and the value is shared by all instances.
    latest_release_time = myMysql.myMysql().getLatestTime(
        tableName="web_jwc_news")

    def parse(self, response):
        """Schedule a request per listed article, then follow the next page.

        Args:
            response: A downloaded list page.

        Yields:
            ``scrapy.Request`` objects for the detail pages and, when a
            usable "next" link exists, for the following list page.
        """
        rows = response.xpath(
            "/html/body/div[3]/div[2]/div[2]/div/div[1]/table/tr")
        for each in rows:
            href = each.xpath("./td[2]/table/tr/td[1]/a/@href").extract_first()
            if not href:
                # A row without a link must not abort the whole page (an
                # IndexError here would also lose the next-page request).
                continue
            detail_url = response.urljoin(href)
            if not detail_url.startswith(("http://", "https://")):
                continue
            print("详情地址:" + detail_url)
            yield scrapy.Request(url=detail_url, callback=self.news_detail)

        # Pagination: follow the "next" link when there is a real one.
        next_href = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_href:
            next_page = response.urljoin(next_href)
            # Ignore non-HTTP targets such as "javascript:" placeholders.
            if next_page.startswith(("http://", "https://")):
                yield scrapy.Request(url=next_page, callback=self.parse)

    def news_detail(self, response):
        """Build a news item from an article page.

        The item is yielded unless the article is older than
        ``latest_release_time`` (the spider is closed) or its link already
        exists in table ``web_jwc_news`` (nothing is yielded).

        Args:
            response: The downloaded detail page.

        Yields:
            The populated ``jwcNewsspiderItem`` when the article is not
            stored yet.

        Raises:
            ValueError: If a date does not match ``%Y-%m-%d``.
        """
        paragraphs = []
        image_urls = []
        for p in response.xpath("/html/body/div[3]/div[2]/div/div/div/div/p"):
            # Join every text node of the paragraph; keeping only the first
            # one truncates paragraphs that contain nested markup.
            text = "".join(p.xpath(".//text()").extract())
            if text:
                paragraphs.append(text + "\r\n")
            # Keep every image; urljoin copes with relative and absolute srcs.
            for src in p.xpath("./img/@src").extract():
                image_urls.append(response.urljoin(src) + ";")

        title = response.xpath(
            "/html/body/div[3]/div[1]/h1/span/span/span/text()"
        ).extract_first()
        release_time = response.xpath(
            "/html/body/div[3]/div[1]/h2/span/span/span/text()"
        ).extract_first()
        if title is None or release_time is None:
            # Unexpected page layout: skip explicitly instead of IndexError.
            self.logger.warning(
                "Skipping %s: title or release time not found", response.url)
            return
        # strptime rejects surrounding whitespace, so normalise the date.
        release_time = release_time.strip()

        item = jwcNewsspiderItem()
        item["news_title"] = title
        item["news_content"] = "".join(paragraphs)
        item["news_source"] = '厦门大学教务处'
        item["news_link"] = response.url
        item["news_release_time"] = release_time
        item["news_read_status"] = '1'
        item["news_get_time"] = time.strftime('%Y-%m-%d')
        item["news_imgs"] = "".join(image_urls)

        print("the latest_release_time from web_jwc_news is {0}".format(
            self.latest_release_time))
        published = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
        latest = time.mktime(
            time.strptime(self.latest_release_time, "%Y-%m-%d"))
        result = int(published) - int(latest)
        print("发布时间:{0} 是否继续:{1}".format(release_time, result))

        # Stop crawling once an article is older than the newest stored one.
        if result < 0:
            self.crawler.engine.close_spider(self, "教务处消息爬取完成!")
            return
        # Do not store an article whose link is already in the database.
        if myMysql.myMysql().columnExist(tableName="web_jwc_news",
                                         columnValue=item['news_link']):
            return
        yield item
class XscNewsSpider(scrapy.Spider):
    """Spider for the XMU Student Affairs Office site (xsc.xmu.edu.cn).

    ``parse`` walks the paginated list pages and schedules one request per
    article; ``news_detail`` turns an article page into an
    ``xscNewsspiderItem``.
    """

    name = 'xsc_xmu_news'
    allowed_domains = ['xsc.xmu.edu.cn']
    start_urls = ['http://xsc.xmu.edu.cn/3084/list.htm']
    # Class-level attribute: the query runs once, when the class body is
    # executed, and the value is shared by all instances.
    latest_release_time = myMysql.myMysql().getLatestTime(
        tableName="web_xsc_news")

    def parse(self, response):
        """Schedule a request per listed article, then follow the next page.

        Args:
            response: A downloaded list page.

        Yields:
            ``scrapy.Request`` objects for the detail pages and, when a
            usable "next" link exists, for the following list page.
        """
        for each in response.xpath('//div[@id="wp_news_w4"]/table/tr'):
            href = each.xpath(
                './/td[2]/table/tr/td[1]/a/@href').extract_first()
            if not href:
                # A row without a link must not abort the whole page (an
                # IndexError here would also lose the next-page request).
                continue
            detail_url = response.urljoin(href)
            if not detail_url.startswith(("http://", "https://")):
                continue
            print("详情地址:{0}".format(detail_url))
            yield scrapy.Request(url=detail_url, callback=self.news_detail)

        # Pagination: follow the "next" link when there is a real one.
        next_href = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_href:
            next_page = response.urljoin(next_href)
            # Ignore non-HTTP targets such as "javascript:" placeholders.
            if next_page.startswith(("http://", "https://")):
                print("next page: {0}".format(next_page))
                yield scrapy.Request(url=next_page, callback=self.parse)

    def news_detail(self, response):
        """Build a news item from an article page.

        The item is yielded unless the article is older than
        ``latest_release_time`` (the spider is closed) or its link already
        exists in table ``web_xsc_news`` (nothing is yielded).

        Args:
            response: The downloaded detail page.

        Yields:
            The populated ``xscNewsspiderItem`` when the article is not
            stored yet.

        Raises:
            ValueError: If a date does not match ``%Y-%m-%d``.
        """
        paragraphs = []
        image_urls = []
        for p in response.xpath('//div[@class="Article_Content"]/p'):
            # Join every text node of the paragraph; keeping only the first
            # one truncates paragraphs that contain nested markup.
            text = "".join(p.xpath(".//text()").extract())
            if text:
                paragraphs.append(text + "\r\n")
            # Keep every image; urljoin copes with relative and absolute srcs.
            for src in p.xpath("./img/@src").extract():
                image_urls.append(response.urljoin(src) + ";")

        title = response.xpath(
            '//span[@class="Article_Title"]/text()').extract_first()
        release_time = response.xpath(
            '//span[@class="Article_PublishDate"]/text()').extract_first()
        if title is None or release_time is None:
            # Unexpected page layout: skip explicitly instead of IndexError.
            self.logger.warning(
                "Skipping %s: title or release time not found", response.url)
            return
        # strptime rejects surrounding whitespace, so normalise the date.
        release_time = release_time.strip()

        item = xscNewsspiderItem()
        item["news_title"] = title
        item["news_content"] = "".join(paragraphs)
        item["news_source"] = '厦门大学学生处'
        item["news_link"] = response.url
        item["news_release_time"] = release_time
        item["news_read_status"] = '1'
        item["news_get_time"] = time.strftime('%Y-%m-%d')
        item["news_imgs"] = "".join(image_urls)

        print("the latest_release_time from web_xsc_news is {0}".format(
            self.latest_release_time))
        published = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
        latest = time.mktime(
            time.strptime(self.latest_release_time, "%Y-%m-%d"))
        result = int(published) - int(latest)
        print("发布时间:{0} 是否继续:{1}".format(release_time, result))

        # Stop crawling once an article is older than the newest stored one.
        if result < 0:
            self.crawler.engine.close_spider(self, "学生处消息爬取完成!")
            return
        # Do not store an article whose link is already in the database.
        if myMysql.myMysql().columnExist(tableName="web_xsc_news",
                                         columnValue=item['news_link']):
            return
        yield item