class IfengSpider(scrapy.Spider): name = "tech" start_urls = [ "http://tech.ifeng.com/listpage/26344/1/list.shtml", #问题来了 "http://tech.ifeng.com/listpage/26333/1/list.shtml", #车科技 "http://tech.ifeng.com/listpage/26335/1/list.shtml", #可穿戴 "http://tech.ifeng.com/listpage/26334/1/list.shtml", #智慧家庭 "http://digi.ifeng.com/listpage/4085/1/list.shtml", #手机 "http://digi.ifeng.com/listpage/11143/1/list.shtml", #苹果 "http://digi.ifeng.com/listpage/11148/1/list.shtml", #平板 "http://digi.ifeng.com/listpage/2689/1/list.shtml", #笔记本 "http://digi.ifeng.com/listpage/5098/1/list.shtml", #影像 ] def parse(self, response): # response.body soup = BeautifulSoup(response.body, "lxml") divs = soup.findAll('div', {'class': 'box_list clearfix'}) for div in divs: # title, content,url item = IfengItem() h2 = div.find('h2') link = h2.find('a') url = link['href'] item['url'] = url title = link['title'] item['title'] = title response2 = urllib.urlopen(url) soup2 = BeautifulSoup(response2, "lxml") content = soup2.find('div', {'id': 'artical_real'}).get_text() item['content'] = content item['label'] = 'technology' if self.check(item['url']): yield item # //*[@id="pagenext"] //*[@id="pagenext"] next_url = response.xpath( "//*[@id='pagenext'] /@href").extract() # 找到下一个链接,也就是翻页。 if next_url: yield scrapy.Request(next_url[0], callback=self.parse) def check(self, url): self.database = Database() self.database.connect('crawl_data') sql = "SELECT * FROM news where url=%s order by url" str_article_url = url.encode('utf-8') data = (str_article_url, ) try: search_result = self.database.query(sql, data) if search_result == (): self.database.close() return True except Exception, e: print e traceback.print_exc() self.database.close() return False
class IfengSpider(scrapy.Spider): name = "cul" start_urls = [ "http://culture.ifeng.com/listpage/59669/1/list.shtml", #眼界 "http://culture.ifeng.com/listpage/59668/1/list.shtml", #艺文 "http://culture.ifeng.com/listpage/59667/1/list.shtml", #思想 "http://culture.ifeng.com/listpage/59665/1/list.shtml", #文学 "http://culture.ifeng.com/listpage/59664/1/list.shtml", #热点 ] def parse(self, response): # response.body soup = BeautifulSoup(response.body, "lxml") divs = soup.findAll('div', {'class': 'box_list clearfix'}) for div in divs: # title, content,url item = IfengItem() h2 = div.find('h2') link = h2.find('a') url = link['href'] item['url'] = url title = link['title'] item['title'] = title response2 = urllib.urlopen(url) soup2 = BeautifulSoup(response2, "lxml") content = soup2.find('div', {'id': 'main_content'}).get_text() item['content'] = content item['label'] = 'culture' if self.check(item['url']): yield item # //*[@id="pagenext"] //*[@id="pagenext"] next_url = response.xpath( "//*[@id='pagenext'] /@href").extract() # 找到下一个链接,也就是翻页。 if next_url: yield scrapy.Request(next_url[0], callback=self.parse) def check(self, url): self.database = Database() self.database.connect('crawl_data') sql = "SELECT * FROM news where url=%s order by url" str_article_url = url.encode('utf-8') data = (str_article_url, ) try: search_result = self.database.query(sql, data) if search_result == (): self.database.close() return True except Exception, e: print e traceback.print_exc() self.database.close() return False
class IfengSpider(scrapy.Spider): name = "history1" start_urls = [ "http://news.ifeng.com/listpage/71096/1/list.shtml", #假设历史 "http://news.ifeng.com/listpage/41708/1/list.shtml", #凤凰历史 "http://news.ifeng.com/listpage/70296/1/list.shtml", #兰台说史 ] def parse(self, response): # response.body soup = BeautifulSoup(response.body, "lxml") #/html/body/div[4]/div[1]/div/div/div[1]/a divs = soup.findAll('div', {'class': 'con_lis show'}) for div in divs: # title, content,url item = IfengItem() url = div.find('a')['href'] title = div.find('h4').get_text() item['url'] = url item['title'] = title response2 = urllib.urlopen(url) soup2 = BeautifulSoup(response2, "lxml") content = soup2.find('div', {'id': 'yc_con_txt'}).get_text() item['content'] = content item['label'] = 'history' if self.check(item['url']): yield item #//*[@id="pagenext"] next_url = response.xpath( "//*[@id='pagenext'] /@href").extract() # 找到下一个链接,也就是翻页。 if next_url: yield scrapy.Request(next_url[0], callback=self.parse) def check(self, url): self.database = Database() self.database.connect('crawl_data') sql = "SELECT * FROM news where url=%s order by url" str_article_url = url.encode('utf-8') data = (str_article_url, ) try: search_result = self.database.query(sql, data) if search_result == (): self.database.close() return True except Exception, e: print e traceback.print_exc() self.database.close() return False
    def check(self, url):
        self.database = Database()
        self.database.connect('crawl_data')
        sql = "SELECT * FROM news where url=%s order by url"
        str_article_url = url.encode('utf-8')
        data = (str_article_url,)
        try:
            search_result = self.database.query(sql, data)
            if search_result == ():
                # url not seen before, keep the item
                self.database.close()
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        self.database.close()
        return False
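# The check() method above is copied verbatim into every spider. A possible
# refactor (a sketch, not part of the original project) is to move the URL
# deduplication into a small mixin that each spider inherits; it assumes the
# same project Database wrapper with connect()/query()/close() used above.
import traceback


class DedupCheckMixin(object):
    """Shared URL-deduplication helper; spiders inherit it alongside scrapy.Spider."""

    def check(self, url):
        database = Database()           # project MySQL wrapper, as above
        database.connect('crawl_data')
        sql = "SELECT * FROM news where url=%s order by url"
        data = (url.encode('utf-8'),)
        try:
            # an empty result tuple means this url has not been stored yet
            if database.query(sql, data) == ():
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        finally:
            database.close()
        return False


# usage sketch:
# class IfengSpider(DedupCheckMixin, scrapy.Spider):
#     name = "tech"
#     ...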
class IfengSpider(scrapy.Spider): name = "sports" start_urls = [ "http://sports.ifeng.com/listpage/11244/1/list.shtml", #中国足球 "http://sports.ifeng.com/listpage/31190/1/list.shtml", #中超 "http://sports.ifeng.com/listpage/35586/1/list.shtml", #亚冠 "http://sports.ifeng.com/listpage/31186/1/list.shtml",#英超 "http://sports.ifeng.com/listpage/31188/1/list.shtml", #西甲 "http://sports.ifeng.com/listpage/31198/1/list.shtml",#欧冠 "http://sports.ifeng.com/listpage/31419/1/list.shtml",#田径 "http://sports.ifeng.com/listpage/31418/1/list.shtml",#网球 "http://sports.ifeng.com/listpage/34120/1/list.shtml",#排球 "http://sports.ifeng.com/listpage/11246/1/list.shtml",#国际足球 "http://sports.ifeng.com/listpage/31194/1/list.shtml",#CBA "http://sports.ifeng.com/listpage/11247/1/list.shtml",#综合 ] def parse(self, response): # response.body soup = BeautifulSoup(response.body, "lxml") divs = soup.findAll('div', {'class': 'box_list clearfix'}) for div in divs: # title, content,url item = IfengItem() h2 = div.find('h2') link = h2.find('a') url = link['href'] item['url'] = url title = link['title'] item['title'] = title response2 = urllib.urlopen(url) soup2 = BeautifulSoup(response2, "lxml") content = soup2.find('div', {'id': 'artical_real'}).get_text() item['content'] = content item['label'] = 'sports' if self.check(item['url']): yield item # //*[@id="pagenext"] //*[@id="pagenext"] next_url = response.xpath("//*[@id='pagenext'] /@href").extract() # 找到下一个链接,也就是翻页。 if next_url: yield scrapy.Request(next_url[0], callback=self.parse) def check(self, url): self.database = Database() self.database.connect('crawl_data') sql = "SELECT * FROM news where url=%s order by url" str_article_url = url.encode('utf-8') data = (str_article_url,) try: search_result = self.database.query(sql, data) if search_result == (): self.database.close() return True except Exception, e: print e traceback.print_exc() self.database.close() return False
class IfengSpider(scrapy.Spider): name = "mil3" start_urls = [ "http://news.qq.com/l/milite/milgn/list2010122872223.htm", #国内军情 "http://news.qq.com/l/milite/milhqj/list2010122872321.htm",#环球军情 "http://news.qq.com/l/milite/junbei/list2012095132410.htm",#军备动态 ] base = "http://news.qq.com/l/milite/junbei/list2012095132410_"#80.htm for i in range(2,51): url = base +str(i)+".htm" start_urls.append(url) base = "http://news.qq.com/l/milite/milhqj/list2010122872321_"#80.htm for i in range(2,333): url = base +str(i)+".htm" start_urls.append(url) base = "http://news.qq.com/l/milite/milgn/list2010122872223_"#80.htm for i in range(2,335): url = base +str(i)+".htm" start_urls.append(url) def parse(self, response): # response.body soup = BeautifulSoup(response.body, "lxml") root = soup.find('div', {'class': 'leftList'}) lis = root.findAll('li') for li in lis: # title, content,url item = IfengItem() url = li.find('a')['href'] item['url'] = url title = li.get_text() item['title'] = title response2 = urllib.urlopen(url) soup2 = BeautifulSoup(response2, "lxml") try: content = soup2.find('div', {'id': 'Cnt-Main-Article-QQ'}).get_text()#Cnt-Main-Article-QQ item['content'] = content except AttributeError: print AttributeError.message item['label'] = 'military' if self.check(item['url']): yield item # next_url = response.xpath("//*[@class='f12'] /@href").extract() # 找到下一个链接,也就是翻页。 # # if next_url: # yield scrapy.Request(next_url[0], callback=self.parse) def check(self, url): self.database = Database() self.database.connect('crawl_data') sql = "SELECT * FROM news where url=%s order by url" str_article_url = url.encode('utf-8') data = (str_article_url,) try: search_result = self.database.query(sql, data) if search_result == (): self.database.close() return True except Exception, e: print e traceback.print_exc() self.database.close() return False