class Zsk(CrawlSpider):
    name = "zsk"
    allowed_domains = ["csdn.net"]
    start_urls = ["http://lib.csdn.net"]
    rules = [
        # Rules for which URLs to crawl
        # (no callback means follow defaults to True)
        #Rule(LinkExtractor(allow=('/base/\w+'), deny=('/base/\w+/resource/\w+'))),
        Rule(sle(allow=("/bases/\w+")), follow=True, callback='parse_item'),
        Rule(sle(allow=("/base/\w+")), follow=False, callback='parse_item')
    ]

    def parse_item(self, response):
        #print response.url
        #logtxt = open("F:\\HW\\scrapyzsk\\csdnzsk\\zsk.txt", 'a+')
        #logtxt.write(response.url + '\r\n')
        #logtxt.close()
        try:
            hxs = HtmlXPathSelector(response)
            item = csdnzsk.items.CsdnzskItem()
            item['name'] = hxs.select(
                "//div[@class='banner_log']/em/text()")[0].extract()
            item['url'] = response.url
            print """**********%s\r\n""" % response.url
            return item
        except Exception, e:
            print Exception, ":", e
class HrtencentSpider(CrawlSpider):
    name = "hrtencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10)
    ]
    rules = [
        Rule(sle(allow=("/position_detail.php\?id=\d*.*", )), callback='parse_2'),
        Rule(sle(allow=("/position.php\?&start=\d{,2}#a")), follow=True, callback='parse_1')
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.tablelist')
        for site in sites:
            item = PositionDetailItem()
            item['sharetitle'] = site.css('.h #sharetitle::text').extract()
            item['bottomline'] = site.css('.bottomline td::text').extract()
            # item['duty'] = site.css('.c .l2::text').extract()
            item['link'] = response.url
            items.append(item)
            print repr(item).decode("unicode-escape") + '\n'
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
class HrtencentSpider(CrawlSpider):
    # NOTE: class and spider name were kept from the hrtencent spider this was
    # adapted from; it actually crawls dewen.io Q&A pages.
    name = "hrtencent"
    allowed_domains = ["dewen.io"]
    start_urls = [
        "http://www.dewen.io/questions?page=%d" % d for d in range(1, 10, 1)
    ]
    rules = [
        Rule(sle(allow=("/q/\d*")), callback='parse_2'),
        Rule(sle(allow=("/questions?page=\d{,4}")), follow=True, callback='parse_1')
    ]

    def parse_2(self, response):
        items = []
        item = PositionDetailItem()
        sel = Selector(response)
        site = sel.css('.container')
        item['sharetitle'] = site.css('#title::text').extract()
        item['description'] = site.css('#qst_content').extract()
        #item['duty'] = site.css('.c .l2::text').extract()
        item['link'] = response.url
        item['tags'] = site.css('#topic a::text').extract()
        # collect content image URLs
        images_1 = sel.css('#qst_content img::attr(loadsrc)').extract()
        images_2 = sel.css('.post_area img::attr(loadsrc)').extract()
        item['image_urls'] = images_1 + images_2
        answers = []
        an_articles = site.css('.ans_item')
        for an_article in an_articles:
            answer = {}
            answer['description'] = an_article.css('.post_area').extract()
            answer['votes'] = an_article.css('.voting::attr(score)').extract()
            if an_article.css('.best_ans_text').extract():
                answer['chosen'] = 1
            else:
                answer['chosen'] = 0
            answers.append(answer)
        item['answers'] = answers
        items.append(item)
        print repr(item).decode("unicode-escape") + '\n'
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
class tmtSpider(CrawlSpider):
    name = 'tmt'
    allowed_domains = ['www.tmtpost.com']
    start_urls = ['http://www.tmtpost.com/lists/hot_list']
    rules = [
        # the hot-topic list has 5 pages in total, 10 articles per page
        Rule(sle(allow=('www.tmtpost.com/hot/\d+')), follow=True),
        Rule(sle(allow=('www.tmtpost.com/\d+.html')), callback='parse_tmt')
    ]

    def parse_tmt(self, response):
        items = []
        sel = Selector(response)
        sites = sel.xpath('/html')
        for site in sites:
            item = TmtItem()
            item['product_url'] = response.url
            item['title'] = site.xpath('//article/h1/text()').extract()
            # In the browser, XPath Checker extracts the time with
            # //article//span[@class="time"]/text(), but Scrapy cannot, even
            # though it is not JS-generated content; interesting.
            item['pub_date'] = site.xpath(
                '//div[@class="post-info"]/span[2]/text()').extract()
            item['intro_content'] = site.xpath(
                '//p[@class="post-abstract"]/text()').extract()
            items.append(item)
        return items
class DoubanBookSpider(CrawlSpider):
    name = "doubanbook"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://book.douban.com/tag/"
    ]
    rules = [
        Rule(sle(allow=("/subject/\d+/?$")), callback='parse_2'),
        Rule(sle(allow=("/tag/[^/]+/?$", )), follow=True),
        Rule(sle(allow=("/tag/$", )), follow=True),
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('#wrapper')
        for site in sites:
            item = DoubanSubjectItem()
            item['title'] = site.css('h1 span::text').extract()
            item['link'] = response.url
            item['content_intro'] = site.css('#link-report .intro p::text').extract()
            items.append(item)
            print repr(item).decode("unicode-escape") + '\n'
            # print item
        # info('parsed ' + str(response))
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
class IcibaSpider(CrawlSpider):
    name = 'iciba'
    allowed_domains = ['iciba.com']
    start_urls = ['http://news.iciba.com/dailysentence/history.html']
    rules = [
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(sle(allow=(
            "/appv3/wwwroot/ds.php\?action=history&ob=1&order=2&page=\d+#nav",
        )), follow=True),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(sle(allow=("/dailysentence/detail-\d+.html#nav")), callback='parse_item'),
    ]

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = IcibaspiderItem()
        item['en'] = response.xpath(
            "//li[@class='en']/descendant::text()").extract()
        item['cn'] = response.xpath(
            "//li[@class='cn']/descendant::text()").extract()
        item['url'] = response.url
        return item
class w61856Spider(CrawlSpider):
    siteid = 1  # id of the site being scraped
    sitename = u'鲜直达'  # name of the site being scraped
    name = "XianZhiDa"
    allowed_domains = ["61856.com"]
    start_urls = [
        "http://www.61856.com/category.php?id=16",
        "http://www.61856.com/category.php?id=17"
    ]
    # rules = [
    #     # Rules for which URLs to crawl
    #     Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True, callback='parse_item')
    # ]
    rules = [
        # Rules for which URLs to crawl
        Rule(sle(allow=(
            "/category.php\?id=16&price_min=0&price_max=0&page=\d{,4}&sort=sort_order&order=DESC"
        )), follow=True, callback='parse_item'),
        Rule(sle(allow=(
            "/category.php\?id=17&price_min=0&price_max=0&page=\d{,4}&sort=sort_order&order=DESC"
        )), follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        # Extract data into Items, mainly with XPath and CSS selectors
        log.start(logfile='log.txt', loglevel=log.WARNING)
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        catalog = sel.css('div.box_1 div.sp_13').xpath('text()').extract()[0]
        sites = sel.css('div.centerPadd div.sp_16')
        for site in sites:
            item = GuoShuItem()
            item['siteid'] = self.siteid
            item['sitename'] = self.sitename
            item['name'] = site.css('p a').xpath('text()').extract()[0]
            relative_url = site.css('p a').xpath('@href').extract()[0]
            item['detailurl'] = urlparse.urljoin(
                base_url, relative_url)  # urljoin_rfc(base_url, relative_url)
            item['catalog'] = catalog
            item['guige'] = site.css('.shop').xpath('text()').extract()[0]
            price = site.css('.shop_s2').xpath('text()').extract()
            item['price'] = price[0].split('/')[0].replace("¥", "")
            item['danwei'] = price[0].split('/')[1]
            items.append(item)
            # print repr(item).decode("unicode-escape") + '\n'
            # log.msg('item %s' % repr(item).decode("unicode-escape"), level=log.WARNING)
        # info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        # info('process ' + str(request))
        return request
class alexaCNSpider(CrawlSpider):
    name = "alexa.cn"
    allowed_domains = ["alexa.com"]
    start_urls = [
        "http://www.alexa.com/",
        "http://www.alexa.com/topsites/category/World/Chinese_Simplified_CN",
    ]
    rules = [
        Rule(sle(allow=(
            "/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
            callback='parse_category_top_xxx', follow=True),
        Rule(sle(allow=("/topsites/category/World/Chinese_Simplified_CN$", )),
            callback='parse_category_top_xxx', follow=True),
        #Rule(sle(allow=("/people/[^/]+$", )), callback='parse_people', follow=True),
    ]

    # www.alexa.com/topsites/category/Top/Computers
    # www.alexa.com/topsites/category;1/Top/Computers
    def parse_category_top_xxx(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)
        sites = sel.css('.site-listing')
        for site in sites:
            item = alexaSiteInfoItem()
            item['url'] = site.css(
                'a[href*=siteinfo]::attr(href)')[0].extract()
            item['name'] = site.css('a[href*=siteinfo]::text')[0].extract()
            item['description'] = site.css('.description::text')[0].extract()
            remainder = site.css('.remainder::text')
            if remainder:
                item['description'] += remainder[0].extract()
            # more specific
            item['category'] = urllib.unquote('/'.join(
                response.url.split('/')[-3:])).decode('utf-8')
            items.append(item)
        return items

    def parse_category_top(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)
        categories = sel.css('li a[href*="/topsites/category/Top/"]')
        for category in categories:
            item = alexaCategoryItem()
            item['url'] = category.css('::attr(href)')[0].extract()
            item['name'] = category.css('::text')[0].extract()
            items.append(item)
        return items
class MiHeSpider(CrawlSpider):
    siteid = 3  # id of the site being scraped
    sitename = u'米禾'  # name of the site being scraped
    name = "MiHe"
    allowed_domains = ["ranlixu.com"]
    start_urls = [
        "http://www.ranlixu.com/class.asp?larCode=1",
        "http://www.ranlixu.com/class.asp?larCode=701"
        # "http://www.ranlixu.com/list.asp?ProdId=A03026"
    ]
    rules = [
        # Rules for which URLs to crawl
        Rule(sle(allow=("/class.asp\?larCode=1&Page=\d{,4}")), follow=True),
        Rule(sle(allow=("/class.asp\?larCode=701&Page=\d{,4}")), follow=True),
        Rule(sle(allow=("/list.asp\?ProdId=G\d{,10}")), follow=True, callback='parse_item'),
        Rule(sle(allow=("/list.asp\?ProdId=A\d{,10}")), follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        # Extract data into Items, mainly with XPath and CSS selectors
        log.start(logfile='log.txt', loglevel=log.WARNING)
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        catalog = sel.css('div.cc').xpath('text()').extract()[2]
        catalog = catalog[catalog.index(u'品牌:'):].replace("\r\n", "").replace(
            "品牌:", "").lstrip().rstrip()
        item = GuoShuItem()
        item['siteid'] = self.siteid
        item['sitename'] = self.sitename
        item['name'] = sel.css('div.cc h2').xpath('text()').extract()[0]
        item['detailurl'] = base_url
        item['catalog'] = catalog
        item['guige'] = sel.css('div.cc b').xpath('text()').extract()[0]
        price = sel.css('div.cc').xpath(
            './/font[@color="red"]/text()').extract()[0]
        item['price'] = price
        item['danwei'] = item['guige']
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
        # log.msg('item %s' % repr(item).decode("unicode-escape"), level=log.WARNING)
        # info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        # info('process ' + str(request))
        return request
class amazonbookSpider(CommonSpider):
    name = "amazonbook"
    allowed_domains = ["amazon.com", "www.amazon.com"]
    start_urls = [
        #"http://www.amazon.com/b/ref=s9_acss_bw_en_BGG15eve_d_1_6?_encoding=UTF8&node=17&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-top-3&pf_rd_r=0XCRZV6SDKBTKDPH8SFR&pf_rd_t=101&pf_rd_p=2293718502&pf_rd_i=283155",
        "http://www.amazon.com/books-used-books-textbooks/b?node=283155",
    ]
    rules = [
        #Rule(sle(allow=("/gp/product/.*")), callback='parse_1', follow=True),
        Rule(sle(allow=("/books-used-books-textbooks/.*")), callback='parse_0', follow=True),
    ]
    css_rules = {
        ".inner .a-row": {
            "url": ".title::attr(href)",
            #"desc": "span::text"
            "title": ".s9TitleText::text",
            "comments": ".a-icon-row .a-size-small::text",
        }
    }

    def parse_0(self, response):
        info('Parse 0 ' + response.url)
        pp.pprint(self.parse_with_rules(response, self.css_rules, dict))
        # .inner .a-row

    def parse_1(self, response):
        info('Parse 1 ' + response.url)
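# parse_with_rules is inherited from CommonSpider (defined elsewhere in this
# collection). A minimal sketch of the idea behind these nested rule dicts,
# under the assumption that a dict value means "the key is a container
# selector, apply the inner rules to each match" and a string value means
# "the key is a field name, the value a leaf CSS selector" (the real helper
# also takes an item class as a third argument):
def parse_with_css_rules(selector, rules):
    result = {}
    for key, value in rules.items():
        if isinstance(value, dict):
            # key selects repeated containers; recurse into each match
            result[key] = [parse_with_css_rules(node, value)
                           for node in selector.css(key)]
        else:
            # key is a field name; value is a CSS selector relative to here
            result[key] = selector.css(value).extract()
    return result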
class templateSpider(CommonSpider):
    name = "template"
    allowed_domains = ["template.com"]
    start_urls = [
        "http://www.template.com/",
    ]
    rules = [
        Rule(sle(allow=("/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
             callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }
    list_css_rules_2 = {
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }
    }
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
class DoubanBookSpider(CrawlSpider):
    name = "ypgs"
    allowed_domains = ["07938.com"]
    start_urls = ["http://www.07938.com/zheligushi/"]
    rules = [
        Rule(sle(allow=("/\d+.html$")), callback='parse_2'),
        # Rule(sle(allow=("/tag/[^/]+/?$", )), follow=True),
        # Rule(sle(allow=("/tag/$", )), follow=True),
    ]

    def parse_2(self, response):
        item = StoryItem()
        sel = Selector(response)
        # extract() so the field holds strings, not a SelectorList
        item['title'] = sel.css("h1::text").extract()
        content = sel.css(".content")
        # TODO: clean up the content formatting
        item['content'] = self.process_content(content)
        # info('parsed ' + str(response))
        return item

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request

    def process_content(self, content):
        # TODO process content
        return content
class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php"]
    rules = [
        Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        p = Pinyin()
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = Website()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            item['description'] = site.css(
                'tr > td:nth-child(2)::text').extract()[0]
            url = site.css('tr > td:nth-child(4)::text').extract()[0]
            item['url'] = p.get_pinyin(url, u'')
            item['address'] = url
            item['num'] = int(
                site.css('tr > td:nth-child(3)::text').extract()[0])
            item['date'] = site.css('tr > td:nth-child(5)::text').extract()[0]
            item['uid'] = item['date'] + '-' + url + '-' + item['name']
            items.append(item)
        return items
class MySpider(CrawlSpider):
    name = "taobao"
    allowed_domains = ["taobao.com"]
    start_urls = [
        "https://www.taobao.com",
    ]
    rules = [
        # Rules for which URLs to crawl
        # search one level below the start page
        Rule(sle(allow=("https://www\.taobao\.com/market/.+"))),
        Rule(sle(allow=("item\.taobao\.com/item\.htm\?spm=.+")),
             follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        items = []
        # filename = response.url.split("/")[-2]
        print "taobao =========================%s" % response.url
        return items
class Jiandan_net(CrawlSpider):
    """Spider definition."""
    name = "jiandan"
    allowed_domains = ['jandan.net']
    start_urls = [
        # start URL for the crawl
        "http://jandan.net/new"
    ]
    rules = (
        Rule(sle(allow=(r"/[\d]{4}/[\d]{2}/[\d]{2}/[\d\w-]{0,50}\.html$")),
             callback="prase_detail"),
        # Rule(sle(allow=(r"/page/[23]"))),
        Rule(sle(deny=(r"/tag/.*")), follow=False),
        # Rule(sle(deny=(r"/author/.*"))),
        Rule(sle(deny=(r"/page/.*")), follow=False),
        Rule(sle(deny=(r"(/v|/duan|/pic|/guanyu|/feed|/app|/|/author/.*)$")), follow=False),
    )

    def prase_detail(self, response):
        Items = []
        resBody = Selector(text=response.body)
        base_url = get_base_url(response)  # source URL
        print '***********************************', base_url
        title = resBody.xpath('//title/text()').extract()[0]  # title
        publishtime = resBody.xpath('//div[@class="time_s"]/text()').re(
            "\s@\s(.*)\s,\s(.*)")  # publish time (needs further processing)
        author = resBody.xpath('//div[@class="time_s"]/a/text()').extract()[
            0]  # author
        content = resBody.xpath(
            '//div[@class="post f"]//p').extract()  # main page content
        item = Jiandan()
        item['title'] = title
        item['source_url'] = base_url
        item['publish_time'] = publishtime
        item['source_author'] = author
        item['content'] = content
        # file_d = open("./jiandan.txt", 'a+')
        # mark = len(file_d.readlines()) + 1
        # file_d.write(str(mark) + " " + str(get_base_url(response)) + title.encode('utf-8') + str(content) + "\n")
        # file_d.close()
        return item
class qqnewsSpider(CommonSpider):
    name = "qqnews"
    allowed_domains = ["tencent.com", 'qq.com']
    start_urls = ['http://news.qq.com/society_index.shtml']
    rules = [
        Rule(sle(allow=('society_index.shtml')), callback='parse_0', follow=True),
        Rule(sle(allow=(".*[0-9]{8}.*htm$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }
    list_css_rules_2 = {
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }
    }
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_0(self, response):
        info('Parse0 ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        #return self.parse_with_rules(response, self.list_css_rules, qqnewsItem)

    def parse_1(self, response):
        info('Parse1 ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
        #import pdb; pdb.set_trace()

    def parse_2(self, response):
        info('Parse2 ' + response.url)
def __init__(self, forum_id=58, digit=1, *args, **kwargs):
    # forum_id and digit arrive as strings when passed via `scrapy crawl -a`;
    # coerce explicitly so the default int values work too (the original
    # concatenated the raw ints into the regex, which raises a TypeError).
    self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
    self.rules = [
        Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," +
                        str(digit) + "}\.html")),
             follow=True, callback='parse_1'),
    ]
    super(sisSpider, self).__init__(*args, **kwargs)
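# Usage sketch (assuming this __init__ belongs to the sisSpider class shown
# further below, whose spider name is "sis"):
#
#   scrapy crawl sis -a forum_id=230 -a digit=2
#
# `-a` arguments reach __init__ as keyword strings, so forum 230 is crawled
# and page URLs up to forum-230-99.html match the generated rule.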
class TencentSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php"]
    rules = [
        # Rules for which URLs to crawl
        Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True, callback='parse_item')
    ]

    # NOTE: overriding parse() on a CrawlSpider bypasses the rules machinery
    # (and the 'parse_item' callback named above is never defined); pagination
    # is instead driven manually via the #next link below.
    def parse(self, response):
        # Extract data into Items, mainly with XPath and CSS selectors
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            yield self.parsePages(site, base_url)
        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            yield self.parsePages(site, base_url)
        next_page = sel.css(
            'table.tablelist tr.f #next ::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parsePages(self, site, base_url):
        item = TencentItem()
        item['name'] = site.css('.l.square a ::text').extract_first()
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
        item['location'] = site.css('tr > td:nth-child(4)::text').extract()
        item['number'] = site.css('tr > td:nth-child(3)::text').extract()
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
        return scrapy.Request(item['detailLink'],
                              meta={'item': item},
                              callback=self.parseDetail)

    def parseDetail(self, response):
        sel = Selector(response)
        item = response.meta['item']
        responsibilities = []
        lis = sel.css('table.tablelist tr:nth-child(3) ul li')
        for li in lis:
            responsibilities.append(li.css('::text').extract_first())
        item['responsibilities'] = responsibilities
        requirements = []
        # NOTE: this reuses the same tr:nth-child(3) selector as the
        # responsibilities block above; a different row was likely intended.
        lis = sel.css('table.tablelist tr:nth-child(3) ul li')
        for li in lis:
            requirements.append(li.css('::text').extract_first())
        item['requirements'] = requirements
        return item
class sinanewsSpider(CommonSpider):
    name = "sinanews"
    allowed_domains = ["news.sina.com.cn"]
    start_urls = [
        "http://news.sina.com.cn/",
    ]
    rules = [
        Rule(sle(allow=("http://news.sina.com.cn/$")), callback='parse_0'),
        Rule(sle(allow=(".*doc[^/]*shtml$")), callback='parse_1'),  #, follow=True),
        #Rule(sle(allow=('/c/2015-11-19/doc-ifxkszhk0386278.shtml')), callback='parse_1', follow=True, process_request='process_request'),
    ]
    list_css_rules = {
        '#blk_yw_01 a': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }
    content_css_rules = {
        'text': 'p::text',
        'images': 'img::attr(src)',
        'images-desc': '.img_descr::text',
        # need url analysis for video
        #'video': '#J_Article_Player',
    }

    def process_request(self, r):
        info('process ' + str(r))
        return r

    def parse_0(self, response):
        info('Parse 0 ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        #pdb.set_trace()
        #return self.parse_with_rules(response, self.list_css_rules, sinanewsItem)

    def parse_1(self, response):
        info('Parse 1 ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
class MySpider(CrawlSpider):
    name = "jingdong"
    allowed_domains = ["jd.com", "3.cn"]
    start_urls = [
        "http://www.jd.com/",
    ]
    # Todo:summary the rules of a CrawlSpider are driven by the parent class's
    # parse(), starting the crawl from the first-level pages. Mind the regexes
    # for page navigation and filtering; they must go into the list in order.
    # Logically the parent parse() appears to be invoked by default, matching
    # the links on each page against the rules.
    rules = (
        # Rules for which URLs to crawl
        Rule(sle(allow=("http://channel.jd.com/.+\.html"))),
        Rule(sle(allow=("http://item.jd.com/\d+\.html")),
             follow=True, callback='parse_item'),
    )

    # def parse_page(self, response):
    #     print response.url
    #     yield Request(response.url, callback=self.parse_item)

    def parse_item(self, response):
        sel = Selector(response)
        filename = response.url.split("/")[-1]
        item = JingdongItem()
        item["url"] = [response.url]
        item["name"] = sel.xpath('//*[@id="name"]/h1/text()').extract()
        # the price is generated by JS...
        # item["price"] = sel.xpath('//div[2]/div[2]/strong/text()').extract()
        # see http://blog.csdn.net/lanshanlei/article/details/42741179
        productid = os.path.splitext(filename)[-2]  # response.url[19:29]
        priceUrl = 'http://p.3.cn/prices/mgets?skuIds=J_' + productid + 'J_'
        r = Request(priceUrl, callback=self.parsePrice)
        r.meta['item'] = item
        yield r

    def parsePrice(self, response):
        sel = Selector(response)
        item = response.meta['item']
        try:
            price = sel.xpath("//text()").extract()[0].encode('utf-8').split('"')[7]
        except Exception, ex:
            print ex
            price = -2
        item['price'] = [price]
        return item
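# The price endpoint returns JSON, so a sketch of a less brittle parsePrice
# body is possible (an assumption for illustration: the response looks like
# [{"id": "J_12345", "p": "99.00", "m": "129.00"}]; parse_price_json is a
# hypothetical helper, not part of the original spider):
import json

def parse_price_json(body):
    """Return the 'p' price field from a p.3.cn-style JSON body, or -2."""
    try:
        data = json.loads(body)
        return data[0].get('p', -2)
    except (ValueError, IndexError, AttributeError):
        # malformed JSON, empty list, or non-dict element
        return -2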
class TencentSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php"]
    rules = [
        # Rules for which URLs to crawl
        Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        # Extract data into Items, mainly with XPath and CSS selectors
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = TencentItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css(
                'tr > td:nth-child(2)::text').extract()[0]
            item['workLocation'] = site.css(
                'tr > td:nth-child(4)::text').extract()[0]
            item['recruitNumber'] = site.css(
                'tr > td:nth-child(3)::text').extract()[0]
            item['publishTime'] = site.css(
                'tr > td:nth-child(5)::text').extract()[0]
            items.append(item)
            #print repr(item).decode("unicode-escape") + '\n'
        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = TencentItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css(
                'tr > td:nth-child(2)::text').extract()[0]
            item['workLocation'] = site.css(
                'tr > td:nth-child(4)::text').extract()[0]
            item['recruitNumber'] = site.css(
                'tr > td:nth-child(3)::text').extract()[0]
            item['publishTime'] = site.css(
                'tr > td:nth-child(5)::text').extract()[0]
            items.append(item)
            #print repr(item).decode("unicode-escape") + '\n'
        info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        info('process ' + str(request))
        return request
class DoubanBookSpider(CrawlSpider):
    name = "douban_book"
    allowed_domains = ["douban.com"]
    start_urls = ["http://book.douban.com/tag/"]
    rules = (
        Rule(sle(allow=("/tag/[^/]+/?$", )), callback="parse_1"),
        Rule(sle(allow=("/tag/$", )), follow=True, process_request='_process_request'),
    )
    # NOTE: depth index is hidden.
    depth_class_list = [
        '.*/tag/?$',
        '.*/tag/.+/?',
    ]

    def _cal_depth(self, response):
        """
        Calculate the depth of response, and call corresponding method or
        stop crawl.
        """
        url = response.url
        for depth, depth_regexp in enumerate(self.depth_class_list):
            if re.match(depth_regexp, url):
                return depth
        # warn("Unknown url depth: " + url)
        # If the url pattern is unknown, then return -1.
        return -1

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
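# Usage sketch for _cal_depth above (the URLs are illustrative):
#   "http://book.douban.com/tag/"         -> 0   (matches '.*/tag/?$')
#   "http://book.douban.com/tag/fiction"  -> 1   (matches '.*/tag/.+/?')
#   "http://book.douban.com/subject/1/"   -> -1  (unknown pattern)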
class tencentDemoSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php"
    ]
    # Todo:summary CrawlSpider parses start_urls with its default parse();
    # this example merges parse and parse_item into one method.
    # Links on the first-level start_urls pages are matched against the rules.
    # An alternative is to write no rules at all and override parse() to
    # extract the product URLs and the next-page URL yourself, which also
    # works well (see the sketch after this class).
    rules = [
        # Rules for which URLs to crawl
        Rule(sle(allow=("/position\.php\?&start=\d{,4}#a")), follow=True, callback='parse')
    ]

    # Whatever parse yields to the pipeline must be an Item, a dict, or a Request.
    def parse(self, response):
        # Extract data into Items, mainly with XPath and CSS selectors
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = TencentdemoItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            # items.append(item)
            yield item
            # print repr(item).decode("unicode-escape") + '\n'
        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = TencentdemoItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            yield item
            # items.append(item)
            # print repr(item).decode("unicode-escape") + '\n'
        # info('parsed ' + str(response))
        # Todo:summary continue with the next page
        urls = sel.xpath('//*[@id="next"]/@href').extract()
        for url in urls:
            print url
            yield Request(urljoin_rfc(base_url, url), callback=self.parse)
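# A minimal sketch of the rule-free alternative mentioned in the comments
# above (assumed code, not from the source): a plain scrapy.Spider whose
# parse() emits the detail-page and next-page Requests itself. The class and
# spider name are hypothetical.
import scrapy

class TencentNoRulesSpider(scrapy.Spider):
    name = "tencent_norules"
    start_urls = ["http://hr.tencent.com/position.php"]

    def parse(self, response):
        # follow each position's detail link
        for href in response.css('table.tablelist .l.square a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)
        # follow pagination via the #next link
        next_page = response.css('#next::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        # yield a plain dict; Scrapy accepts dicts as items
        yield {'url': response.url}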
class vipSpider(CrawlSpider):
    name = 'vip'
    allowed_domains = ['m.vip.com']
    start_urls = ['http://m.vip.com']
    rules = [
        Rule(sle(allow=(u'index\.php.*')), follow=True),
        Rule(sle(allow=('m.vip.com/product.*')), callback='parse_vip'),
        Rule(sle(allow=('m.vip.com/brand.*')), follow=True),
        # Rule(sle(allow=(u'product.*')), callback='parse_vip')
    ]

    def parse_vip(self, response):
        items = []
        sel = Selector(response)
        sites = sel.xpath('/html')
        for site in sites:
            item = VipItem()
            item['product_url'] = response.url
            item['image_url'] = site.xpath(
                '//li[@style="width: 224px; display: table-cell; vertical-align: top;"]//img[@src][1]'
            ).re(r'src=(.*?) data')
            item['price'] = site.xpath(
                '//span[@class="u-detail-price"]/text()').extract()
            item['name'] = site.xpath('//h1/text()').extract()
            # item['brand'] =
            # item['location'] =
            # item['material'] =
            items.append(item)
        return items
class sisSpider(CrawlSpider):
    name = "sis"
    ip = "38.103.161.147"
    allowed_domains = [ip]
    ip_format = 'http://' + ip + '/forum/forum-%d-1.html'
    start_urls = [ip_format % d for d in [143, 230]]
    rules = [
        Rule(sle(allow=("/forum/thread-\d*-1-1\.html")), callback='parse_2'),
        Rule(sle(allow=("/forum/forum-\d*-1\.html")), follow=True, callback='parse_1'),
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.postcontent')[0:1]
        for site in sites:
            item = SisItem()
            item['title'] = site.css('.postmessage h2::text').extract()
            item['imgs'] = site.css('.postmessage img::attr(src)').extract()
            item['torrents'] = site.css(
                '.t_attachlist a[href*=attachment]').extract()
            # item['duty'] = site.css('.c .l2::text').extract()
            item['link'] = response.url
            items.append(item)
            print repr(item).decode("unicode-escape") + '\n'
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
class BaiduEncySpider(CrawlSpider):
    name = 'baiduEncy'
    allowed_domains = ['baike.baidu.com']
    start_urls = [
        'http://baike.baidu.com/wenhua',
        'http://baike.baidu.com/dili',
        'http://baike.baidu.com/shenghuo',
    ]
    rules = [
        Rule(sle(allow=("/view/\d+.htm$")), callback='parse1'),
        # a stray mid-pattern '$' made the original "/view/\d+$/\d+.htm$" unmatchable
        Rule(sle(allow=("/view/\d+/\d+.htm$")), callback='parse1'),
        Rule(sle(allow=("/\w+$", )), follow=True),
    ]

    def parse1(self, response):
        item = BaiduencyclopediaItem()
        item['id'] = response.url.split('/')[-1].split('.')[0]
        item['name'] = response.xpath('//span[@class="lemmaTitleH1"]/text()').extract()
        summary = response.xpath('//div[@class="card-summary-content"]').extract()
        item['summaryText'] = re.compile('<[^>]*>').sub('', summary[0])
        rawAttrs = response.css('.biItemInner')
        attr = {}
        for rAttr in rawAttrs:
            attrName = ''.join(rAttr.css('.biTitle ::text').extract())
            attrValue = ''.join(rAttr.css('.biContent ::text').extract())
            attr[attrName] = attrValue
        item['attr'] = attr
        # 'sapn' in the original XPath was a typo for 'span'
        rawLables = response.xpath('//span[@class="taglist"]/text()')
        lable = []
        for rLable in rawLables:
            lab = rLable.extract()
            lable.append(lab)
        item['lable'] = lable
        return [item, ]

    def _process_request(self, request):
        info('process ' + str(request))
        return request
class ClassName(CrawlSpider):
    """docstring for ClassName"""
    name = "bdhub"
    allowed_domains = ["ibmbigdatahub.com"]
    start_urls = ["http://www.ibmbigdatahub.com/blogs"]
    rules = [
        Rule(sle(allow=("/blogs\?page=\d{,4}")), follow=True, callback='parse_item')
    ]

    def isempty(self, var):
        if len(var):
            return var[0]
        else:
            return None

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        siteviews = sel.css('div.view-content div.views-row')
        for view in siteviews:
            item = DbhubItem()
            view = view.css('div.node__teaser')
            item['author'] = self.isempty(
                view.css('div.node__attributes span.blogger__name').xpath(
                    'a/text()').extract())
            item['authorTitle'] = self.isempty(
                view.css('div.node__attributes span.blogger__title-and-company'
                         ).xpath('text()').extract())
            item['imageUrl'] = self.isempty(
                view.css('div.blog__image a').xpath('img/@src').extract())
            item['title'] = self.isempty(
                view.css('h2').xpath('a/text()').extract())
            item['date'] = self.isempty(
                view.css('div.node__attributes span.blog__created-date').xpath(
                    'text()').extract())
            item['summery'] = self.isempty(
                view.css('div.blog__summary').xpath('text()').extract())
            relative_url = self.isempty(
                view.css('h2').xpath('a/@href').extract())
            item['link'] = urljoin_rfc(base_url, relative_url)
            item['baseUrl'] = response.url
            items.append(item)
        info("parsed " + str(response))
        return items

    def parse_start_url(self, response):
        return self.parse_item(response)
class E21jobSpider(CrawlSpider):
    name = "e21job"
    allowed_domains = ["job.e21.edu.cn"]
    start_urls = [
        "http://job.e21.edu.cn/stu_more.php?page=0&fenye=yes"
    ]
    rules = [
        Rule(sle(allow=("stu_more.php\?page=\d{,5}&fenye=yes")),
             follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        #sites = sel.css('table:nth-child(5)').css('table:nth-child(2)').css('table.black12').css('tr')
        for i in range(1, 50, 2):
            item = GraduateItem()
            query = 'tr:nth-child(%d)' % i
            #print query
            site = sel.css('table:nth-child(5)').css('table:nth-child(2)').css('table.black12').css(query)
            array = site.css('a').xpath('text()').extract()
            if len(array) == 1:
                item['name'] = array[0]
            array = site.css('a').xpath('@href').extract()
            if len(array) == 1:
                relative_url = array[0]
                item['detailLink'] = urljoin_rfc(base_url, relative_url)
            array = site.css('td:nth-child(2)::text').extract()
            if len(array) == 1:
                item['school'] = array[0]
            array = site.css('td:nth-child(3)::text').extract()
            if len(array) == 1:
                item['specialty'] = array[0]
            array = site.css('td:nth-child(4)::text').extract()
            if len(array) == 1:
                item['education'] = array[0]
            items.append(item)
            #print repr(item).decode("unicode-escape") + '\n'
        info('parsed ' + str(response))
        #log.msg(str('parsed ' + str(response)), level=log.INFO)
        return items

    def _process_request(self, request):
        #print request
        info('process ' + str(request))
        #log.msg(str('process ' + str(request)), level=log.INFO)
        return request
class MySpider(CrawlSpider):
    name = "yaohao"
    allowed_domains = ["bjhjyd.gov.cn"]
    start_urls = [
        "http://www.bjhjyd.gov.cn/",
    ]
    rules = [
        # Rules for which URLs to crawl
        # search one level below the start page
        # NOTE: these rules were copied from the taobao spider; since
        # allowed_domains filters out taobao.com, they never match here and
        # only parse_start_url runs.
        Rule(sle(allow=("https://www\.taobao\.com/market/.+"))),
        Rule(sle(allow=("item\.taobao\.com/item\.htm\?spm=.+")),
             follow=True, callback='parse_item')
    ]

    def parse_start_url(self, response):
        print response.url
        sel = Selector(response)
        print "+++++++++++++++++++++++"
        print sel.xpath('//*[@id="getValidCode"]').extract()
        return None

    def parse_item(self, response):
        items = []
        # filename = response.url.split("/")[-2]
        print "taobao =========================%s" % response.url
        return items
class templateSpider(CommonSpider):
    name = "template"
    allowed_domains = ["template.com"]
    start_urls = [
        "http://www.template.com/",
    ]
    rules = [
        # NOTE: naming a rule callback 'parse' is discouraged for CrawlSpider
        # subclasses, since CrawlSpider uses parse() internally to drive the rules.
        Rule(sle(allow=(
            "/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
            callback='parse', follow=True),
    ]

    def parse(self, response):
        info('Parse ' + response.url)