def parse(self, response):
    '''
    :param response:
    :return: extract the URL of each section and pass it to parse_pages
    '''
    # test
    # url = 'http://m.sohu.com/cl/58/?page=1'
    # yield scrapy.Request(url, self.parse_item)
    # end test
    file_name = "log-" + str(datetime.date.today()) + ".txt"
    t_file = codecs.open(file_name, 'ab', encoding='utf-8')
    if self.isFirst:
        self.isFirst = False
        line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
        t_file.write(line.decode("unicode_escape"))
    updatetool = UpdateTool()
    self.log('Hi, this is a page! %s' % response.url)
    self.new = 0
    self.pages += 1
    root = bs(response.body.decode('utf-8'))
    div = root.find("div", attrs={"class": "bd3 pb1"})
    lis = div.findAll("p")
    for li in lis:
        item = FenghuoItem()
        iurl = 'm.sohu.com' + li.find("a").get("href")
        title = li.find("a").text
        pubdate = root.find('p', attrs={'class': 'w c2'}).text
        month = pubdate[16:18]
        day = pubdate[19:21]
        hour = pubdate[22:24]
        year = pubdate[11:15]
        item_date = datetime.date(int(year), int(month), int(day))
        item['url'] = iurl
        item['title'] = title
        item['pubdate'] = str(item_date)
        item['snatch_time'] = datetime.datetime.now()
        item['topPost'] = 1
        item['site_name'] = '手机搜狐网'
        item['site_url'] = "m.sohu.com/"
        print item
        # only store items that are unseen and recent enough
        if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
            self.new += 1
            self.total_new += 1
            fp.process_item(item, "123")
    url = 'http://m.sohu.com/cl/58/?page=' + str(self.pages)
    if self.new > 3 and self.hasNext:
        yield scrapy.Request(url, self.parse)
    else:
        line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
        t_file.write(line.decode("unicode_escape"))
def parse(self, response):
    '''
    :param response:
    :return: extract the URL of each section and pass it to parse_pages
    '''
    # test
    # url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-1.html'
    # yield scrapy.Request(url, self.parse_item)
    # end test
    file_name = "log-" + str(datetime.date.today()) + ".txt"
    t_file = codecs.open(file_name, 'ab', encoding='utf-8')
    if self.isFirst:
        self.isFirst = False
        line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
        t_file.write(line.decode("unicode_escape"))
    updatetool = UpdateTool()
    self.log('Hi, this is a page! %s' % response.url)
    self.new = 0
    self.pages += 1
    root = bs(response.body.decode('utf-8'))
    div = root.find("div", attrs={"class": "ulnotice"})
    lis = div.findAll("li")
    for li in lis:
        item = FenghuoItem()
        iurl = li.find("a").get("href")
        if iurl[0:4] != 'http':
            iurl = 'http://gs.whu.edu.cn' + iurl
        title = li.find("a").text
        pubdate = li.find("span").text
        month = pubdate[6:8]
        day = pubdate[9:11]
        hour = '00'
        year = pubdate[1:5]
        item_date = datetime.date(int(year), int(month), int(day))
        if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
            self.new += 1
            self.total_new += 1
            yield scrapy.Request(iurl, self.parse_items)  # according to iurl, request the detail page
    url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-' + str(self.pages) + '.html'
    if self.new > 10 and self.hasNext:
        yield scrapy.Request(url, self.parse)
    else:
        line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
        t_file.write(line.decode("unicode_escape"))
def parse(self, response):
    file_name = "log-" + str(datetime.date.today()) + ".txt"
    t_file = codecs.open(file_name, 'ab', encoding='utf-8')
    if self.isFirst:
        self.isFirst = False
        line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
        t_file.write(line.decode("unicode_escape"))
    updatetool = UpdateTool()
    self.log('Hi, this is a page! %s' % response.url)
    self.new = 0
    self.pages += 1
    root = bs(response.body)
    div = root.find("div", "fallsFlow")
    lis = div.findAll("li")
    for li in lis:
        iurl = li.find("a").get("href")
        pubdate = li.find("h6").text
        year = pubdate[0:4]
        month = pubdate[5:7]
        day = pubdate[8:10]
        item_date = datetime.date(int(year), int(month), int(day))
        yield scrapy.Request(iurl, self.parse_items)  # according to iurl, request the detail page
        if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
            self.new += 1
            self.total_new += 1
    if self.pages >= 2:
        url = 'http://china.huanqiu.com/local/' + str(self.pages) + '.html'
    else:
        url = 'http://china.huanqiu.com/local/index.html'
    if self.new > 10 and self.hasNext:
        yield scrapy.Request(url, self.parse)  # follow the next listing page computed above
    else:
        line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
        t_file.write(line.decode("unicode_escape"))
def parse(self, response):
    '''
    :param response:
    :return: extract the detailed information of the news pages
    '''
    # test
    # url = 'http://www.xcxww.com/content-19-3751-1.html'
    # yield scrapy.Request(url, self.parse_item)
    # end test
    self.log('Hi, this is a page! %s' % response.url)
    file_name = "log-" + str(datetime.date.today()) + ".txt"
    t_file = codecs.open(file_name, 'ab', encoding='utf-8')
    if self.isFirst:
        self.isFirst = False
        line = str(datetime.datetime.now()) + " " + self.name + " spider has started!\n"
        t_file.write(line.decode("unicode_escape"))
    updatetool = UpdateTool()
    self.new = 0
    self.pages += 1
    root = bs(response.body.decode('gb2312'))
    codeTexts = root.find("script", attrs={"language": "JavaScript"}).text
    maxPage = codeTexts[codeTexts.index("maxPage") + 10:codeTexts.index("maxPage") + 14]
    maxPage = int(maxPage)  # needed below for the maxPage - self.pages + 1 arithmetic
    # print maxPage
    div = root.find("div", attrs={"class": "new-article"})
    lis = div.findAll("div", attrs={"class": "article-list"})
    for li in lis:
        iurl = li.find("h3").contents[1].get("href")
        title = li.find("h3").contents[1].text
        pubdate = li.find("div", attrs={"class": "time"}).text
        pubdate = pubdate[pubdate.index(" ") + 1:]
        year = pubdate[0:4]
        month = pubdate[5:7]
        day = pubdate[8:10]
        item_date = datetime.date(int(year), int(month), int(day))
        yield scrapy.Request(iurl, self.parse_items)  # according to iurl, request the detail page
        if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
            self.new += 1
            self.total_new += 1
    url = 'http://news.sohu.com/guoneixinwen_' + str(maxPage - self.pages + 1) + '.shtml'
    print url
    if self.new > 10 and self.hasNext:
        yield scrapy.Request(url, self.parse)
    else:
        line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
        t_file.write(line.decode("unicode_escape"))
import codecs
import scrapy
from scrapy.spiders import CrawlSpider  # base class used below (older Scrapy versions import it from scrapy.contrib.spiders)
from BeautifulSoup import BeautifulSoup as bs
from BeautifulSoup import Comment  # used to remove comments in text
from fenghuo.items import FenghuoItem
import HTMLParser  # used to parse HTML
import requests
from com.crawler.dao.UpdateTool import UpdateTool
import datetime, calendar

root_domain = "http://news.sohu.com/"

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from fenghuo.pipelines import FenghuoPipeline as FP

fp = FP()
updatetool = UpdateTool()


class SoHuSpider(CrawlSpider):
    name = "sohus"
    allowed_domains = ["news.sohu.com"]
    start_urls = ['http://news.sohu.com/guoneixinwen.shtml']
    new = 11
    total_new = 0
    url = start_urls[0]
    hasNext = True
    now = datetime.date.today()
    delay = datetime.timedelta(days=3)
    pages = 1
    isFirst = True
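
# --- Hypothetical sketch (not part of the original source) ---
# The list-page callbacks above schedule scrapy.Request(iurl, self.parse_items),
# but parse_items itself is not shown in this section. Below is a minimal sketch
# of what such a detail-page callback could look like, assuming the FenghuoItem
# fields and the fp.process_item(item, "123") call used by the m.sohu.com spider.
# The CSS classes ("title", "time") are placeholders, not selectors from the
# original sites.
def parse_items(self, response):
    root = bs(response.body.decode('utf-8'))
    item = FenghuoItem()
    item['url'] = response.url
    item['title'] = root.find("h1", attrs={"class": "title"}).text   # placeholder selector
    item['pubdate'] = root.find("span", attrs={"class": "time"}).text  # placeholder selector
    item['snatch_time'] = datetime.datetime.now()
    item['topPost'] = 1
    item['site_name'] = self.name
    item['site_url'] = self.url
    fp.process_item(item, "123")  # same pipeline call used by the m.sohu.com spider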