def __init__(self, url=None):
    if url:
        # Fetch unvisited urls over REST: POST to retrieve here (PUT for
        # create, GET for read, DELETE for delete), e.g.
        # http://localhost:5000/unvisitedurls?start=0&offset=10&spider=douban
        unirest.timeout(180)
        req = unirest.post(url, headers={"Accept": "application/json"})
        self.start_urls = [data['url'] for data in req.body['data']]
        self.name = url[url.find('spider=') + 7:]
    self.visitedurldict = OrderedDict()
    self.datadict = OrderedDict()
    self.filedict = OrderedDict()
    self.deadurldict = OrderedDict()
    self.visitedurldict['urls'] = []
    self.datadict['datas'] = []
    self.filedict['files'] = []
    self.deadurldict['urls'] = []
    # NOTE: this assigns a local `rules`, which has no effect; CrawlSpider only
    # honors self.rules (or a class attribute). sle is the repo's LinkExtractor alias.
    rules = (
        Rule(sle(allow=(r"http://book.douban.com/isbn/\d+$")), callback="parse", follow=True),
        Rule(sle(allow=(r"http://book.douban.com/subject/\d+$")), callback="parse", follow=True),
    )
    # __del__ does not work reliably for cleanup, so hook the spider_closed signal instead.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
class HrtencentSpider(CrawlSpider):
    name = "hrtencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10)
    ]
    rules = [
        Rule(sle(allow=(r"/position_detail.php\?id=\d*.*", )), callback='parse_2'),
        Rule(sle(allow=(r"/position.php\?&start=\d{,2}#a", )), follow=True, callback='parse_1'),
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.tablelist')
        for site in sites:
            item = PositionDetailItem()
            item['sharetitle'] = site.css('.h #sharetitle::text').extract()
            item['bottomline'] = site.css('.bottomline td::text').extract()
            # item['duty'] = site.css('.c .l2::text').extract()
            item['link'] = response.url
            items.append(item)
            print(repr(item).decode("unicode-escape") + '\n')
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # The url cannot easily be encoded to Chinese. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
class DoubanBookSpider(CrawlSpider):
    name = "doubanbook"
    allowed_domains = ["douban.com"]
    start_urls = ["https://book.douban.com/tag"]
    rules = [
        Rule(sle(allow=(r"/subject/\d+$", )), callback="parse_2"),
        Rule(sle(allow=(r"/tag/[^/]+$", )), follow=True),
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        # '#wrapper' matches <div id="wrapper">
        sites = sel.css('#wrapper')
        for site in sites:
            item = DoubanSubjectItem()
            item['title'] = site.css("h1 span::text").extract()
            item['link'] = response.url
            item['content_intro'] = site.css('#link-report .intro p::text').extract()
            items.append(item)
            print(item)
        return items

    def process_request(self, request):
        return request
def __init__(self, url=None):
    if url:
        # Fetch unvisited urls over REST, e.g.
        # http://localhost:5000/unvisitedurls?start=0&offset=10&spider=6w
        unirest.timeout(180)
        req = unirest.post(url, headers={"Accept": "application/json"})
        self.start_urls = [data['url'] for data in req.body['data']]
        self.name = url[url.find('spider=') + 7:]
    self.visitedurldict = OrderedDict()
    self.datadict = OrderedDict()
    self.filedict = OrderedDict()
    self.deadurldict = OrderedDict()
    self.visitedurldict['urls'] = []
    self.datadict['datas'] = []
    self.filedict['files'] = []
    self.deadurldict['urls'] = []
    # Same caveat as the variant above: assigning a local `rules` has no effect;
    # CrawlSpider only honors self.rules (or a class attribute).
    rules = (
        Rule(sle(allow=(r"http://book.douban.com/isbn/\d+$")), callback="parse", follow=True),
        Rule(sle(allow=(r"http://book.douban.com/subject/\d+$")), callback="parse", follow=True),
    )
    # __del__ does not work reliably for cleanup, so hook the spider_closed signal instead.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
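# The REST-seeded __init__ above (and its twin earlier) pulls start_urls with
# unirest, which is unmaintained. Below is a hedged equivalent using requests;
# the endpoint and the {"data": [{"url": ...}]} response shape are inferred
# from the code above, and fetch_start_urls is a name invented here.
import requests

def fetch_start_urls(url, timeout=180):
    # e.g. url = "http://localhost:5000/unvisitedurls?start=0&offset=10&spider=douban"
    resp = requests.post(url, headers={"Accept": "application/json"}, timeout=timeout)
    resp.raise_for_status()
    return [row['url'] for row in resp.json()['data']]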
class alexaCNSpider(CrawlSpider):
    name = "alexa.cn"
    allowed_domains = ["alexa.com"]
    start_urls = [
        "http://www.alexa.com/",
        "http://www.alexa.com/topsites/category/World/Chinese_Simplified_CN",
    ]
    rules = [
        Rule(sle(allow=(r"/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$", )),
             callback='parse_category_top_xxx', follow=True),
        Rule(sle(allow=(r"/topsites/category/World/Chinese_Simplified_CN$", )),
             callback='parse_category_top_xxx', follow=True),
        # Rule(sle(allow=(r"/people/[^/]+$", )), callback='parse_people', follow=True),
    ]

    # Example category pages:
    #   www.alexa.com/topsites/category/Top/Computers
    #   www.alexa.com/topsites/category;1/Top/Computers
    def parse_category_top_xxx(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)
        sites = sel.css('.site-listing')
        for site in sites:
            item = alexaSiteInfoItem()
            item['url'] = site.css('a[href*=siteinfo]::attr(href)')[0].extract()
            item['name'] = site.css('a[href*=siteinfo]::text')[0].extract()
            item['description'] = site.css('.description::text')[0].extract()
            remainder = site.css('.remainder::text')
            if remainder:  # longer descriptions carry a "more" remainder
                item['description'] += remainder[0].extract()
            # keep the more specific category path from the url
            item['category'] = urllib.unquote(
                '/'.join(response.url.split('/')[-3:])).decode('utf-8')
            items.append(item)
        return items

    def parse_category_top(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)
        categories = sel.css('li a[href*="/topsites/category/Top/"]')
        for category in categories:
            item = alexaCategoryItem()
            item['url'] = category.css('::attr(href)')[0].extract()
            item['name'] = category.css('::text')[0].extract()
            items.append(item)
        return items
class hacker_newsSpider(CommonSpider):
    name = "hacker_news"
    allowed_domains = ["news.ycombinator.com"]
    start_urls = [
        "https://news.ycombinator.com/",
    ]
    rules = [
        Rule(sle(allow=("https://news.ycombinator.com/$", )), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        'title': '.storylink::text',
        'desc': '.subtext .score::text',
    }
    # Copied from the qqnews template; unused by this spider.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
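# parse_with_rules is provided by this repo's CommonSpider base class. Below is
# a minimal standalone sketch of the idea, an assumption rather than the repo's
# exact code (the real method is a bound method and also accepts an extra flag,
# seen later as a fourth argument): a flat string value is a CSS selector whose
# text is extracted as a list, while a dict value recurses, scoping the
# sub-rules to every element matched by its key.
def parse_with_rules(response, rules, item_class):
    def extract(selector, rule):
        item = item_class()
        for key, value in rule.items():
            if isinstance(value, dict):
                # nested rule: build one sub-item per element matched by `key`
                item[key] = [extract(el, value) for el in selector.css(key)]
            else:
                item[key] = selector.css(value).extract()
        return item
    # callers index the result as x[0][<outer selector>]
    return [extract(response, rules)]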
class twitchSpider(CommonSpider):
    name = "twitch"
    allowed_domains = ["twitch.tv"]
    start_urls = [
        "https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft"
    ]
    rules = [
        Rule(sle(allow=("https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft")),
             callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.content': {
            'room_name': '.meta .title a::text',
            'author': '.meta .info a::text',
            'people_count': '.meta .info a::attr(data-ember-action)'
        }
    }
    # Copied from the qqnews template; unused by this spider.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        # import pdb; pdb.set_trace()  # debugging leftover, disabled
        print(json.dumps(x, ensure_ascii=False, indent=2))
class v2exSpider(CommonSpider):
    name = "v2ex"
    allowed_domains = ["v2ex.com"]
    start_urls = [
        "http://www.v2ex.com/",
    ]
    rules = [
        Rule(sle(allow=("http://www.v2ex.com/$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.cell.item': {
            'title': '.item_title a::text',
            'node': '.node::text',
            'author': '.node+ strong a::text',
            'reply_count': '.count_livid::text'
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        # import pdb; pdb.set_trace()
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
class samairSpider(CommonSpider):
    name = "samair"
    allowed_domains = ["samair.ru"]
    start_urls = ['http://www.samair.ru/proxy/']
    rules = [
        Rule(sle(allow=("proxy/$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '#proxylist tr': {
            # alternative: "xpath:.//*[not(contains(@style,'display:none'))]/text()"
            'ip': "td:nth-child(1) *::text",
            # 'port': 'td:nth-child(3) a::text',
            'anonymity': 'td:nth-child(2) *::text',
            'last_checked': 'td:nth-child(3) *::text',
            'country': 'td:nth-child(4) a::text',
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict, True)
        x = x[0]['#proxylist tr']
        pp.pprint(x)
class HouseSpider(CrawlSpider):
    name = 'house'
    allowed_domains = ['hz.fang.lianjia.com']
    start_urls = ['http://hz.fang.lianjia.com/loupan']
    rules = (
        Rule(sle(allow=(r"/pg\d{0,4}")), follow=False, callback='parse_item'),
    )

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        houses = sel.xpath('//div[@class="resblock-desc-wrapper"]')
        for house in houses:
            item = HouseItem()
            house_name = house.xpath('div[@class="resblock-name"]/a/text()').extract()
            house_address = house.xpath('div[@class="resblock-location"]/a/text()').extract()
            house_price = house.xpath(
                'div[@class="resblock-price"]/div[@class="main-price"]/span/text()').extract()
            house_url = house.xpath('div[@class="resblock-name"]/a/@href').extract()
            url = base_url + '/' + ''.join(house_url).split('/')[2]
            item['house_name'] = house_name
            item['house_address'] = house_address
            # the price is split across two <span>s: the number and the unit
            item['house_price'] = house_price[0] + house_price[1].strip()
            item['house_url'] = url
            items.append(item)
        return items
class CnbetaSpider(CrawlSpider):
    name = 'cnbeta'
    allowed_domains = ['cnbeta.com']
    start_urls = ['http://www.cnbeta.com/']
    rules = [
        Rule(sle(allow=(r"/articles/.*\.htm")), callback='parse_cnbeta', follow=True),
    ]

    def parse_cnbeta(self, response):
        logger.debug('parse_cnbeta: ' + response.url)
        articlelist = Selector(response).xpath('//div[@class="cnbeta-article"]')
        items = []
        for article in articlelist:
            item = CnbetaItem()
            item['title'] = article.xpath('header[@class="title"]/h1/text()').extract_first()
            item['publishtime'] = article.xpath(
                'header[@class="title"]/div[@class="meta"]/span/text()').extract_first()
            logger.info('function: parse_cnbeta, url: ' + response.url + ' , item: ' + str(item))
            items.append(item)
        return items
class pandatvSpider(CommonSpider):
    name = "pandatv"
    allowed_domains = ["panda.tv"]
    start_urls = [
        "http://www.panda.tv/all",
    ]
    rules = [
        Rule(sle(allow=("http://www.panda.tv/all")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.video-list-item.video-no-tag': {
            'room_name': '.video-title::text',
            'author': '.video-nickname::text',
            'people_count': '.video-number::text',
            'tag': '.video-cate::text',
        }
    }
    # Copied from the qqnews template; unused by this spider.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
class github_trendingSpider(CommonSpider):
    name = "github_trending"
    allowed_domains = ["github.com"]
    start_urls = [
        "http://www.github.com/trending",
    ]
    rules = [
        Rule(sle(allow=("/trending$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.repo-list-item': {
            'repo_name': '.repo-list-name a::attr(href)',
            'repo_meta': '.repo-list-meta::text',
        }
    }
    # Copied from the qqnews template; unused by this spider.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
class amazonbookSpider(CommonSpider):
    name = "amazonbook"
    allowed_domains = ["amazon.com", "www.amazon.com"]
    start_urls = [
        # "http://www.amazon.com/b/ref=s9_acss_bw_en_BGG15eve_d_1_6?_encoding=UTF8&node=17&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-top-3&pf_rd_r=0XCRZV6SDKBTKDPH8SFR&pf_rd_t=101&pf_rd_p=2293718502&pf_rd_i=283155",
        "http://www.amazon.com/books-used-books-textbooks/b?node=283155",
    ]
    rules = [
        # Rule(sle(allow=("/gp/product/.*")), callback='parse_1', follow=True),
        Rule(sle(allow=("/books-used-books-textbooks/.*")), callback='parse_0', follow=True),
    ]
    css_rules = {
        ".inner .a-row": {
            "url": ".title::attr(href)",
            # "desc": "span::text",
            "title": ".s9TitleText::text",
            "comments": ".a-icon-row .a-size-small::text",
        }
    }

    def parse_0(self, response):
        info('Parse 0 ' + response.url)
        pp.pprint(self.parse_with_rules(response, self.css_rules, dict))

    def parse_1(self, response):
        info('Parse 1 ' + response.url)
class youtube_trendingSpider(CommonSpider):
    name = "youtube_trending"
    allowed_domains = ["youtube.com"]
    start_urls = [
        "https://www.youtube.com/feed/trending",
    ]
    rules = [
        Rule(sle(allow=("feed/trending$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.yt-lockup-content': {
            'video_title': '.yt-lockup-title a::text',
            'author': '.yt-lockup-byline a::text',
        }
    }
    # Copied from the qqnews template; unused by this spider.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
class alexa_topsitesSpider(CommonSpider):
    name = "alexa_topsites"
    allowed_domains = ["alexa.com"]
    start_urls = [
        "http://www.alexa.com/topsites",
    ]
    rules = [
        Rule(sle(allow=("http://www.alexa.com/topsites")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.site-listing': {
            'rank': '.count::text',
            'name': '.desc-paragraph a::text',
            'desc': '.description::text'
        }
    }
    # Copied from the qqnews template; unused by this spider.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        # parse once, log, and return the same result; the original parsed the
        # page twice, and json.dumps cannot serialize Item objects anyway
        x = self.parse_with_rules(response, self.list_css_rules, alexa_topsitesItem)
        pp.pprint(x)
        return x
class QuotesSpider(CrawlSpider):
    name = 'planet_char'
    rotate_user_agent = True
    allowed_domains = ['www.anime-planet.com']
    start_urls = ['https://www.anime-planet.com']
    rules = (
        Rule(sle(allow='/characters/'), callback='parse_anime_links', follow=True),
    )

    def parse_anime_links(self, response):
        server = anime_planet_character()
        server['name'] = response.css('#siteContainer > h1::text').get()
        server['gender'] = response.css(
            '#siteContainer > section.pure-g.entryBar > div:nth-child(1)::text').get()
        server['hair_color'] = response.css(
            '#siteContainer > section.pure-g.entryBar > div:nth-child(2)::text').get()
        server['image_url'] = response.css(
            '#siteContainer > section:nth-child(10) > div.pure-1.md-2-3 > div.pure-g.entrySynopsis > div.pure-1-2.md-1-3 > div > img::attr(src)').get()
        server['about'] = response.css(
            '#siteContainer > section:nth-child(10) > div.pure-1.md-2-3 > div.pure-g.entrySynopsis > div.pure-1.md-2-3 > div::text').getall()
        server['tags'] = response.css(
            '#siteContainer > section:nth-child(10) > div.pure-1.md-2-3 > div.pure-g.entrySynopsis > div.pure-1.md-2-3 > div.tags > ul > li > a::text').getall()
        return server
class proxy4freeSpider(CommonSpider):
    name = "proxy4free"
    allowed_domains = ["proxy4free.com"]
    start_urls = ['http://www.proxy4free.com/list/webproxy1.html']
    rules = [
        Rule(sle(allow=(r"list/webproxy[0-9]+\.html")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        'tbody tr': {
            # alternative: "xpath:.//*[not(contains(@style,'display:none'))]/text()"
            'domain': "td:nth-child(2) *::text",
            # 'port': 'td:nth-child(3) a::text',
            'country': 'td:nth-child(4) *::text',
            'rating': 'td:nth-child(5) *::text',
            'access_time': 'td:nth-child(6) *::text',
            'uptime': 'td:nth-child(7) *::text',
            'online_since': 'td:nth-child(8) *::text',
            'last_checked': 'td:nth-child(9) *::text',
            'features_hian': 'td:nth-child(10) *::text',
            'features_ssl': 'td:nth-child(11) *::text',
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict, True)
        x = x[0]['tbody tr']
        pp.pprint(x)
class templateSpider(CommonSpider):
    name = "template"
    allowed_domains = ["template.com"]
    start_urls = [
        "http://www.template.com/",
    ]
    rules = [
        Rule(sle(allow=("/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
             callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }
    list_css_rules_2 = {
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }
    }
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
class proxylistorgSpider(CommonSpider):
    name = "proxylistorg"
    allowed_domains = ["proxy-list.org"]
    start_urls = ['https://proxy-list.org/english/index.php']
    rules = [
        Rule(sle(allow=(r"english/index.php(\?p=[0-9]+)?$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '#proxy-table .table ul': {
            # alternative: "xpath:.//*[not(contains(@style,'display:none'))]/text()"
            'ip': "li:nth-child(1)::text",
            # 'port': 'td:nth-child(3) a::text',
            'anonymity': 'li:nth-child(4)::text',
            'speed': 'li:nth-child(3)::text',
            'ssl': 'li:nth-child(2)::text',
            'country': 'li:nth-child(5) *::text',
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict, True)
        x = x[0]['#proxy-table .table ul']
        pp.pprint(x)
class musasiSpider(CommonSpider):
    name = "musasi"
    # allowed_domains takes bare domain names, not URLs
    allowed_domains = ["www.musasi.jp"]
    start_urls = ["http://www.musasi.jp/"]
    rules = [
        Rule(sle(allow=("/explanation/[0-9]*/.*$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }
    # Copied from the qqnews template; unused by this spider.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def start_requests(self):
        # FormRequest needs an absolute URL (the original passed a bare path)
        return [
            scrapy.FormRequest("http://www.musasi.jp/ichikawa-chuo/login",
                               formdata={
                                   'username': '******',
                                   'password': '******'
                               },
                               callback=self.parse_1)
        ]

    def parse_1(self, response):
        info('Parse ' + response.url)
class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["dooland.com"]
    start_urls = [
        # "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        # "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
        "http://www.dooland.com/magazine/article_784457.html"
    ]
    rules = [
        # Rule(sle(allow=('/Resources/')), callback='parse_details'),
        Rule(sle(allow=(r'/magazine/article_784457\.html')), callback='parseBooks', follow=False),
        # Rule(sle(allow=()), callback='parse_pages'),  # pagination
        # Rule(sle(allow=(), restrict_xpaths=('//*[@id="page"]')), callback='parse_pages', follow=True)
    ]

    def parseBooks(self, response):
        for sel in response.xpath('//*[@id="main"]/article/div[1]/div[1]'):
            item = ArticleItem()
            # debugging leftover: this stores the interpreter encoding, not a URL
            item['url'] = sys.getdefaultencoding()
            item['title'] = sel.xpath('h1/text()').extract()[0].strip()
            return item  # restored; the original left this return commented out
class QuotesSpider(CrawlSpider):
    name = 'anime_list'
    allowed_domains = ['myanimelist.net']
    start_urls = ['https://myanimelist.net/anime/34134/One_Punch_Man_2nd_Season']
    rules = (
        Rule(sle(
            allow='https://myanimelist.net/anime/',
            deny=('userrecs', 'video', 'episode', 'characters', 'featured',
                  'profile', r'login\.php', r'anime\.php',
                  # the original was missing the comma after 'password\.php',
                  # silently fusing the two patterns into 'password\.phppressroom'
                  r'password\.php', 'pressroom',
                  'people', 'advertising', 'producer', 'membership',
                  'manga_translation_battle', 'watch', r'about\.php',
                  r'register\.php', 'genre', 'reviews', 'stats', 'forum',
                  'clubs', 'character', 'news', 'modules')),
            callback='parse_images', follow=True),
    )

    def parse_images(self, response):
        image = AjarWallAlphaItem()
        image['image_url'] = response.request.url
        image['image_pixels'] = response.css(
            '#contentWrapper > div:nth-child(1) > h1 > span::text').get()
        return image
class PictureSpider(CrawlSpider):
    name = "picture"
    allowed_domains = ["tumblr.com"]
    start_urls = [
        "https://nanue1.tumblr.com/",
    ]
    rules = [
        # Rule(sle(allow=(r"/likes")), callback='parse_likes', follow=True),
        # Rule(sle(allow=(r"/likes")), callback='parse_likes'),
        Rule(sle(allow=(r"/following")), callback='parse_following', follow=True),
        Rule(sle(allow=(r"/archive")), callback='parse_archive', follow=True),
    ]

    def get_archive_post_pic(self, response):
        print("Parse archive post page for pic......")
        # pic_page_urls = re.findall(r'https://\d+.media.tumblr.com/\S+/\S+_\d+.jpg', response.body)
        pic_page_urls = re.findall(r'https://\d+.media.tumblr.com/\S+/\S+_1280.jpg', response.body)
        if pic_page_urls:
            item = TumblrPictureItem()
            for pic_page_url in pic_page_urls:
                print(pic_page_url)
                item['url'] = [urlparse.urljoin(response.url, pic_page_url)]
                yield item

    def parse_archive(self, response):
        print("Parse archive......")
        print(response.url)
        archive_post_urls = response.xpath('//*[@class="hover"]/@href').extract()
        for archive_post_url in archive_post_urls:
            req = Request(archive_post_url, dont_filter=True, callback=self.get_archive_post_pic)
            yield req

    def parse_following(self, response):
        print("Parse following user......")
        follows_blog_urls = response.xpath('//*[@class="blog-name"]/@href').extract()
        for follow_blog_url in follows_blog_urls:
            req = Request(follow_blog_url, dont_filter=True)
            print(follow_blog_url)
            yield req
class XiCiDaiLiSpider(CrawlSpider):
    name = 'xicidaili'
    allowed_domains = ['www.xicidaili.com']
    start_urls = [
        'http://www.xicidaili.com/nn',
        'http://www.xicidaili.com/nt',
        'http://www.xicidaili.com/wn'
    ]
    rules = [
        Rule(sle(allow=(r"/nn/[\d]{1,2}$")), callback='parse_proxy', follow=True),
        Rule(sle(allow=(r"/nt/[\d]{1,2}$")), callback='parse_proxy', follow=True),
        Rule(sle(allow=(r"/wn/[\d]{1,2}$")), callback='parse_proxy', follow=True),
    ]
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    }

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, headers=self.headers, dont_filter=True)

    def parse_proxy(self, response):
        logger.debug('parse_proxy: ' + response.url)
        table = Selector(response).xpath('//table[@id="ip_list"]')[0]
        # './/tr' keeps the search inside the table ('//tr' would scan the
        # whole document); [1:] skips the header row
        trs = table.xpath('.//tr')[1:]
        items = []
        for tr in trs:
            item = XiCiDaiLiItem()
            item['ip'] = tr.xpath('td[2]/text()').extract()[0]
            item['port'] = tr.xpath('td[3]/text()').extract()[0]
            item['position'] = tr.xpath('string(td[4])').extract()[0].strip()
            item['type'] = tr.xpath('td[6]/text()').extract()[0]
            item['speed'] = tr.xpath('td[7]/div/@title').re(r'\d+\.\d*')[0]
            item['last_check_time'] = tr.xpath('td[10]/text()').extract()[0]
            if 'http' in item['type'].lower():
                logger.info('function: parse_proxy, url: ' + response.url + ' , item: ' + str(item))
                items.append(item)
        return items
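# A hypothetical items.py entry with the fields parse_proxy fills above; the
# real XiCiDaiLiItem definition is not shown in this collection.
import scrapy

class XiCiDaiLiItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    position = scrapy.Field()
    type = scrapy.Field()
    speed = scrapy.Field()
    last_check_time = scrapy.Field()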
class qqnewsSpider(CommonSpider):
    name = "qqnews"
    allowed_domains = ["tencent.com", 'qq.com']
    start_urls = ['http://news.qq.com/society_index.shtml']
    rules = [
        Rule(sle(allow=('society_index.shtml')), callback='parse_0', follow=True),
        Rule(sle(allow=(r".*[0-9]{8}.*htm$")), callback='parse_1', follow=True),
    ]
    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }
    list_css_rules_2 = {
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }
    }
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_0(self, response):
        info('Parse0 ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        # return self.parse_with_rules(response, self.list_css_rules, qqnewsItem)

    def parse_1(self, response):
        info('Parse1 ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
        # import pdb; pdb.set_trace()

    @staticmethod
    def parse_2(response):
        info('Parse2 ' + response.url)
class QuotesSpider(CrawlSpider):
    name = 'pdf'
    allowed_domains = ['www.pdfdrive.com']
    start_urls = ['https://www.pdfdrive.com/']
    rules = (
        Rule(sle(allow='', deny=('/auth/', '/home/', r'/search\?', '/category/')),
             callback='parse_anime', follow=True),
    )

    def parse_anime(self, response):
        item = pdfdrive()
        # the long selectors below share these common prefixes
        left = 'body > div.dialog > div.dialog-main > div.dialog-left'
        ebook = left + ' > div.ebook-main > div.ebook-right > div'
        # item['name'] = response.css('#main > div > div.widget.info > div > div:nth-child(1)').get()
        item['main_url'] = response.url
        item['name'] = response.css(ebook + ' > h1::text').get()
        item['image_url'] = response.css(
            left + ' > div.ebook-main > div.ebook-left > a > img::attr(src)').get()
        item['year_pub'] = response.css(ebook + ' > div.ebook-file-info > span:nth-child(3)::text').get()
        item['total_pages'] = response.css(ebook + ' > div.ebook-file-info > span:nth-child(1)::text').get()
        item['book_size'] = response.css(ebook + ' > div.ebook-file-info > span:nth-child(5)::text').get()
        item['no_downloads'] = response.css(
            ebook + ' > div.ebook-file-info > span.info-green.hidemobile::text').get()
        item['book_language'] = response.css(ebook + ' > div.ebook-file-info > span:nth-child(9)::text').get()
        item['book_id'] = response.css('#previewButtonMain::attr(data-id)').get()
        item['book_preview'] = response.css('#previewButtonMain::attr(data-preview)').get()
        item['book_buy'] = response.css(
            left + ' > div.ebook-main > div.ebook-buttons > div > a::attr(href)').get()
        item['book_quotes'] = response.css(left + ' > div.quotes::text').get()
        item['book_author'] = response.css(ebook + ' > div.ebook-author > a > span::text').get()
        item['book_tags'] = response.css(ebook + ' > div.ebook-tags > a::text').getall()
        item['download_url'] = response.css('#download-button-link::attr(href)').get()
        return item
class youku_Spider(CommonSpider):
    name = "movies_spider"
    allowed_domains = ["list.youku.com", "v.youku.com"]
    start_urls = [
        "http://list.youku.com/category/show/c_97.html",
        "http://list.youku.com/category/show/c_96.html",
        "http://list.youku.com/category/show/c_100.html",
    ]
    rules = [
        Rule(sle(allow=(r"list.youku.com/category/show/c_[10967]+_?[_a-z0-9]*\.html")),
             callback='parse_1', follow=True),
        Rule(sle(allow=(r"v.youku.com/v_show/id_[\S+]+.html.*?")),
             callback='parse_tv', follow=True),
    ]
    # used on list pages
    content_css_rules = {
        'title': 'div.p-thumb a::attr(title)',
        'url': 'div.p-thumb a::attr(href)',
        'bg_img_url': 'div.p-thumb img::attr(src)',
        # 'images': '#Cnt-Main-Article-QQ img::attr(src)',
        # 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }
    # used on player pages
    player_css_rules = {
        'category': 'h1.title a::text',
        'title': 'div.tvinfo h3::text',
        'sub_title': '#vpofficiallistv5_wrap div.items .item::attr(title)',
        'seq': '#vpofficiallistv5_wrap div.items .item::attr(seq)',
        'url': '#vpofficiallistv5_wrap div.items .item a::attr(href)',
    }

    def parse_1(self, response):
        # info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        # print(json.dumps(x, ensure_ascii=False, indent=2))
        return x

    def parse_tv(self, response):
        x = self.parse_with_rules(response, self.player_css_rules, dict)
        return x
def __init__(self, conf_module='TestSpiderConfig', *args, **kwargs):
    cm = __import__(conf_module, globals=globals())
    conf = cm.Config()
    self.name = conf.name
    self.allowed_domains = conf.allowed_domains
    self.start_urls = conf.start_urls
    self.rules = [Rule(sle(allow=(c.allowed_rule_regex)),
                       callback='parse_1', cb_kwargs=c.paras, follow=True)
                  for c in conf.ex_rules]
    info(self.start_urls)
    info(self.rules)
    # rules must be in place before CrawlSpider.__init__ compiles them
    super(general_spiderSpider, self).__init__(*args, **kwargs)
class sinanewsSpider(CommonSpider):
    name = "sinanews"
    allowed_domains = ["news.sina.com.cn"]
    start_urls = [
        "http://news.sina.com.cn/",
    ]
    rules = [
        Rule(sle(allow=("http://news.sina.com.cn/$")), callback='parse_0'),
        Rule(sle(allow=(".*doc[^/]*shtml$")), callback='parse_1'),  # , follow=True),
        # Rule(sle(allow=('/c/2015-11-19/doc-ifxkszhk0386278.shtml')), callback='parse_1', follow=True, process_request='process_request'),
    ]
    list_css_rules = {
        '#blk_yw_01 a': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }
    content_css_rules = {
        'text': 'p::text',
        'images': 'img::attr(src)',
        'images-desc': '.img_descr::text',
        # video needs url analysis first
        # 'video': '#J_Article_Player',
    }

    def process_request(self, r):
        info('process ' + str(r))
        return r

    def parse_0(self, response):
        info('Parse 0 ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        # pdb.set_trace()
        # return self.parse_with_rules(response, self.list_css_rules, sinanewsItem)

    def parse_1(self, response):
        info('Parse 1 ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
class LianjiaSpider(scrapy.Spider):
    name = "lianjia"
    allowed_domains = ["lianjia.com"]
    start_urls = ["http://tj.lianjia.com/ershoufang/pg2/"]

    # response.css('.page-box .house-lst-page-box::attr(page-url)').extract()
    # response.css('.page-box .house-lst-page-box::attr(page-data)').extract()

    # NOTE: `rules` only takes effect on a CrawlSpider; on a plain
    # scrapy.Spider it is ignored and only parse() below runs.
    rules = (
        Rule(sle(allow=(r'http://tj.lianjia.com/ershoufang/pg\d+$')), callback='parse_item'),
        # items = LinkExtractor(allow=('/ershoufang/pg2')).extract_links(response)
        # for i in items:
        #     print(i)
        # Rule(SgmlLinkExtractor(allow=('huhuuu/p/',)), callback='parse_item'),
        # Rule(SgmlLinkExtractor(allow=('huhuuu/archive/',)), callback='parse_item'),
    )
    # rules = [
    #     Rule(sle(allow=(r"/pg\d+$")), callback='parse', follow=True),
    # ]

    def parse(self, response):
        # dump the raw page to a file named after the last url segment
        filename = response.url.split("/")[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.info')
        for site in sites:
            item = TutorialItem()
            item['title'] = site.css('.info .title a::text').extract()
            items.append(item)
            print(item['title'])
        return items
class DoubanBookSpider(CrawlSpider):
    name = "douban_book"
    allowed_domains = ["douban.com"]
    start_urls = ["http://book.douban.com/tag/"]
    rules = (
        Rule(sle(allow=("/tag/[^/]+/?$", )), callback="parse_1"),
        Rule(sle(allow=("/tag/$", )), follow=True, process_request='_process_request'),
    )
    # NOTE: the depth index is implicit (list position = depth).
    depth_class_list = [
        '.*/tag/?$',
        '.*/tag/.+/?',
    ]

    def _cal_depth(self, response):
        """Calculate the depth of a response: return the index of the first
        matching pattern, or -1 if the url shape is unknown."""
        url = response.url
        for depth, depth_regexp in enumerate(self.depth_class_list):
            if re.match(depth_regexp, url):
                return depth
        # warn("Unknown url depth: " + url)
        return -1

    def parse_1(self, response):
        # The url cannot easily be encoded to Chinese. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
def __init__(self, conf_module='TestSpiderConfig', *args, **kwargs):
    # accept either a module name or a file name
    if conf_module.endswith(".py"):
        conf_module = conf_module[:-3]
    cm = __import__(conf_module, globals=globals())
    conf = cm.Config()
    self.name = conf.name
    self.allowed_domains = conf.allowed_domains
    self.start_urls = conf.start_urls
    self.rules = [Rule(sle(allow=(c.allowed_rule_regex)),
                       callback='parse_1', cb_kwargs=c.paras, follow=conf.follow)
                  for c in conf.ex_rules]
    self.conf = conf
    info(self.start_urls)
    info(self.rules)
    info([[c.allowed_rule_regex, c.paras] for c in conf.ex_rules])
    # import pdb; pdb.set_trace()
    super(general_json_spiderSpider, self).__init__(*args, **kwargs)
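# A hypothetical TestSpiderConfig.py showing the minimal shape the two
# config-driven __init__ variants above read; every name here is inferred from
# the attribute accesses in the code, not taken from the repo.
class ExRule(object):
    def __init__(self, allowed_rule_regex, paras=None):
        self.allowed_rule_regex = allowed_rule_regex
        self.paras = paras or {}  # forwarded to the callback via cb_kwargs

class Config(object):
    name = "test_spider"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]
    follow = True  # only the json variant reads this
    ex_rules = [
        ExRule(r"/articles/\d+\.html", {"category": "article"}),
    ]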
def __init__(self, software, *args, **kwargs):
    self.software = software
    if software in ("wireshark", "FFmpeg", "openssl"):
        self.start_urls = [
            'https://github.com/%s/%s/commits/master/?page=0' % (software, software),
            'https://github.com/%s/%s/commits/master/?page=101' % (software, software),
        ]
        self.rules = [
            Rule(sle(allow=('/%s/%s/commit/[a-f0-9]+' % (software, software)),
                     restrict_xpaths=('//a[@class="message"]')),
                 follow=True, callback='parse_item'),
            Rule(sle(allow=(r'https://github.com/%s/%s/commits/master\?page=\d+' % (software, software)),
                     restrict_xpaths=('//div[@class="pagination"]/a[contains(.,"Older")]'))),
        ]
    elif software == "httpd":
        self.start_urls = [
            'https://github.com/apache/%s/commits/trunk?page=0' % software,
            'https://github.com/apache/%s/commits/trunk?page=101' % software,
        ]
        self.rules = [
            Rule(sle(allow=('/apache/%s/commit/[a-f0-9]+' % software),
                     restrict_xpaths=('//a[@class="message"]')),
                 follow=True, callback='parse_item'),
            Rule(sle(allow=(r'https://github.com/apache/%s/commits/trunk\?page=\d+' % software),
                     restrict_xpaths=('//div[@class="pagination"]/a[contains(.,"Older")]'))),
        ]
    elif software == "linux":
        self.start_urls = [
            'https://github.com/torvalds/%s/commits/master' % software,
            'https://github.com/torvalds/%s/commits/master?page=101' % software,
        ]
        self.rules = [
            Rule(sle(allow=('/torvalds/%s/commit/[a-f0-9]+' % software),
                     restrict_xpaths=('//a[@class="message"]')),
                 follow=True, callback='parse_item'),
            Rule(sle(allow=(r'https://github.com/torvalds/%s/commits/master\?page=\d+' % software),
                     restrict_xpaths=('//div[@class="pagination"]/a[contains(.,"Older")]'))),
        ]
    # rules must be assigned before CrawlSpider.__init__ compiles them
    super(SpiderGithub, self).__init__()
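# Usage note: the `software` argument above arrives from the command line via
# -a, e.g. `scrapy crawl <spider-name> -a software=linux` (the spider's `name`
# attribute is not shown in this snippet).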
def __init__(self, forum_id=58, digit=1, *args, **kwargs):
    # self.ip_format is a class attribute defined elsewhere in this spider
    self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
    self.rules = [
        Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," + str(digit) + r"}\.html")),
             follow=True, callback='parse_1'),
    ]
    super(sisSpider, self).__init__(*args, **kwargs)
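# Usage note: keyword spider arguments map to -a flags the same way, e.g.
# `scrapy crawl sis -a forum_id=58 -a digit=2` (assuming the spider is
# registered as `sis`), which follows /forum/forum-58-N.html pagination links
# for page numbers up to two digits.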