class PoiSpider(CrawlSpider):
    """Crawl poi86.com and extract POI records for one administrative district.

    Follows the district's paginated listing pages, then parses each POI
    detail page into a PoiItem (name, address, category, and coordinates
    in three datums).
    """

    # AMap administrative district code baked into the start/rule URLs below.
    district = '110108'
    name = 'poi'
    allowed_domains = ['poi86.com']
    start_urls = ('http://www.poi86.com/poi/amap/district/' + district +
                  '/1.html', )
    rules = (
        # District pagination pages: follow only (no callback).
        Rule(
            SgmlLinkExtractor(
                allow=(r'http://www.poi86.com/poi/amap/district/' + district +
                       '/\d+.html'))),
        # POI detail pages: parse into items.
        Rule(SgmlLinkExtractor(
            allow=(r'http://www.poi86.com/poi/amap/\d+.html')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract one POI from a detail page.

        NOTE(review): these absolute li[N] XPaths are layout-dependent and
        will silently return [] if the site template shifts — verify against
        a live page.
        """
        item = PoiItem()
        # '/html/body/div[2]/div/div[1]/h1'
        item['name'] = response.xpath(
            '/html/body/div[2]/div[1]/div[1]/h1/text()').extract()
        item['address'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[4]/text()').extract()
        item['category'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[6]/text()').extract()
        # Coordinates in the three common Chinese mapping datums.
        item['wgs_84'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[7]/text()').extract()
        item['gcj_02'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[8]/text()').extract()
        item['bd_09'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[9]/text()').extract()
        yield item
class BloggerSpider(CrawlSpider): name = "TheHackerWay" start_urls = ['http://thehackerway.com'] # urls desde las cuales el spider comenzará el proceso de crawling rules = [ Rule(SgmlLinkExtractor(allow=[r'/\d{4}']), follow=True, callback='parse_blog'), # r'/\d+' : expression regular para http://thehackerway.com/X URLs Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}\d{2}/\w+']), callback='parse_blog') ] # http://thehackerway.com/YYYY/MM/DD/titulo URLs def parse_blog(self, response): print 'link parseado %s' % response.url hxs = HtmlXPathSelector(response) item = HackerWayItem() item['title'] = hxs.select( '//title/text()').extract() # Selector XPath para el titulo item['author'] = hxs.select( "//span[@class='author']/a/text()").extract( ) # Selector XPath para el author item['tag'] = hxs.select("//meta[@property='og:title']/text()" ).extract() # Selector XPath para el tag item['date'] = hxs.select("//span[@class='date']/text()").extract( ) # Selector XPath para la fecha return item # Retornando el Item.
class AirbnbSpider(RentBaseSpider):
    """Crawl airbnb.com from the sitemap index and scrape listing pages.

    Follows sitemap section links and pagination, then parses each listing
    (anchors under itemprop='name') into a PlaceItem.
    """

    name = "airbnb"
    allowed_domains = ["airbnb.com"]
    start_urls = ["https://www.airbnb.com/sitemaps"]
    rules = (
        # Sitemap section links — follow only.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=("//*[@class='sitemap']//a", ))),
        # Pagination within a section.
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("//*[@class='next next_page']/a", ))),
        # Individual listing pages.
        Rule(SgmlLinkExtractor(restrict_xpaths=("//*[@itemprop='name']/a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""
        item = PlaceItem()
        item['item_source'] = response.url
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        # First text node of the listing-name element; raises IndexError if absent.
        return response.xpath("//*[@id='listing_name']/text()").extract()[0]

    def get_price(self, response):
        # Price amount, whitespace-stripped; raises IndexError if absent.
        return response.xpath(
            "//*[contains(@class,'book-it__price-amount')]//text()").extract(
            )[0].strip()
def test_attrs(self):
    """Link extraction restricted via the ``attrs`` argument.

    Covers: attrs as a single string; an (href, src) tuple with extra tags
    and extension filtering disabled; attrs=None (extracts nothing); and
    attrs=("href") — a parenthesized string, not a tuple — on an
    SgmlLinkExtractor with a typo'd 'ref' attribute that must be ignored.
    """
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # With src allowed and deny_extensions=() the .jpg image is extracted too.
    lx = self.extractor_cls(attrs=("href", "src"),
                            tags=("a", "area", "img"),
                            deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # attrs=None disables attribute matching entirely.
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])
    # Only the area's real href is picked up; 'ref' is not an href.
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs=("href"))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
class ListSpider(CrawlSpider):
    """Crawl news.cnblogs.com list pages and scrape article metadata."""

    # Spider name.
    name = "tutorial"
    # Download delay (seconds) between requests.
    download_delay = 1
    # Allowed domains.
    allowed_domains = ["news.cnblogs.com"]
    # Start URL.
    start_urls = ["https://news.cnblogs.com"]
    # Crawl rules: follow pagination, parse article pages.
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_content'),
    )

    # Content-parsing callback.
    def parse_content(self, response):
        """Extract title / author / release date from one article page.

        NOTE(review): .extract() returns unicode in Python 2, and calling
        .decode('utf-8') on unicode only survives ASCII content — confirm
        this does not raise on non-ASCII titles.
        """
        item = TutorialItem()
        # Current article page.
        title = response.selector.xpath(
            '//div[@id="news_title"]')[0].extract().decode('utf-8')
        item['title'] = title
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        releasedate = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['releasedate'] = releasedate
        yield item
class DoubanCrawler(CrawlSpider):
    """Crawl Douban's Top 250 movie list and scrape each movie's page."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]
    #allowed_domains = ["fling.seas.upenn.edu/"]
    #start_urls = ["https://fling.seas.upenn.edu/~yinfeiy/"]
    # NOTE(review): rule patterns use http:// while start_urls is https:// —
    # confirm the extractor still matches the scheme actually served.
    rules = (Rule(
        SgmlLinkExtractor(allow=(
            r'http://movie\.douban\.com/top250\?start=\d+&filter=&type=', ))),
             Rule(SgmlLinkExtractor(
                 allow=(r'http://movie\.douban\.com/subject/\d+', )),
                  callback='parse_page',
                  follow=True))

    def start_requests(self):
        """Issue start requests with a desktop UA and the over18 cookie.

        NOTE(review): callback=self.parse_page bypasses CrawlSpider's default
        parse (which applies the rules) for the start pages — confirm that
        link-following from the top250 page is still intended.
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
        }
        for i, url in enumerate(self.start_urls):
            yield Request(url,
                          cookies={'over18': '1'},
                          callback=self.parse_page,
                          headers=headers)

    def parse_page(self, response):
        """Extract movie title, summary, and URL from a subject page."""
        sel = Selector(response)
        item = DoubanMovieItem()
        item['name'] = sel.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()
        item['desc'] = sel.xpath(
            '//div/span[@property="v:summary"]/text()').extract()
        item['url'] = response.url
        return item
class HomeawaySpider(RentBaseSpider):
    """Crawl homeaway.com search results and scrape listing name/price."""

    name = "homeaway"
    allowed_domains = ["homeaway.com"]
    start_urls = ['https://www.homeaway.com/search']
    rules = (
        # The sixth region-refinement block plus result pagination — follow only.
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("(//*[@class='region-refinement'])[6]",
                                 "//*[@class='next']/a"))),
        # Listing detail pages.
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            "//*[@class='hit-content']//*[@class='hit-headline']//a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""
        item = PlaceItem()
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        # Listing headline; raises IndexError if the element is missing.
        return response.xpath(
            "(//*[@class='container hidden-phone']//h1/text())").extract()[0]

    def get_price(self, response):
        # price for some places is unavailable, only available on request
        price = response.xpath("(//*[@class='price-large']/text())").extract()
        not_available_message = "Available on Inquiry"
        return price[0] if price else not_available_message
def test_deny_extensions(self):
    """deny_extensions given as a plain string filters matching file types."""
    html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
    response = HtmlResponse("http://example.org/", body=html)
    lx = SgmlLinkExtractor(deny_extensions="jpg")
    # The .jpg link is dropped; only the page link remains.
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.org/page.html', text=u'asd'),
    ])
def test_attrs(self):
    """The ``attrs`` argument controls which tag attributes yield links.

    Same scenarios as the generic extractor test: string attrs, (href, src)
    with deny_extensions=(), attrs=None, and the parenthesized-string
    attrs=("href") case against SgmlLinkExtractor.
    """
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # src on img is honoured once deny_extensions stops filtering .jpg.
    lx = self.extractor_cls(attrs=("href", "src"),
                            tags=("a", "area", "img"),
                            deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # attrs=None extracts nothing.
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])
    # The typo'd 'ref' attribute must not be treated as a link source.
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs=("href"))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_attrs_sgml(self):
    """SgmlLinkExtractor(attrs="href") ignores the typo'd 'ref' attribute."""
    html = """<html><area href="sample1.html"></area> <a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs="href")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def __init__(self, selector=None, type='css', *args, **kwargs):
    """Build a link extractor optionally restricted by a selector.

    selector: a CSS or XPath expression limiting where links are taken from.
    type: 'css' (translated to XPath via pyquery) or 'xpath' (used as-is);
    anything else raises. Remaining arguments pass straight through to
    SgmlLinkExtractor.
    """
    if selector:
        if type == 'xpath':
            restriction = selector
        elif type == 'css':
            # pyquery's internal translator converts the CSS selector to XPath.
            restriction = pyquery.PyQuery('a')._css_to_xpath(selector)
        else:
            raise Exception('Selector type not supported.')
        kwargs['restrict_xpaths'] = restriction
    SgmlLinkExtractor.__init__(self, *args, **kwargs)
def test_link_nofollow(self):
    """rel="nofollow" is reflected in Link.nofollow; its absence gives False."""
    html = """
    <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
    <a href="about.html">About us</a>
    """
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual([link for link in lx.extract_links(response)], [
        Link(url='http://example.org/page.html?action=print',
             text=u'Printer-friendly page',
             nofollow=True),
        Link(url='http://example.org/about.html',
             text=u'About us',
             nofollow=False),
    ])
class homespider(CrawlSpider):
    """Crawl qd.fang.lianjia.com new-home listings and scrape detail pages.

    BUGFIX: the domain filter attribute was misspelled `allow_domains`,
    which Scrapy silently ignores; the offsite middleware only honours
    `allowed_domains`.
    """

    name = 'home'
    allowed_domains = ['qd.fang.lianjia.com']
    # Listing index pages pg1..pg47.
    start_urls = []
    for i in range(1, 48):
        start_urls.append('http://qd.fang.lianjia.com/loupan/pg' + str(i))
    rules = (
        # Development links inside the result box — follow only.
        Rule(
            SgmlLinkExtractor(allow=('loupan/p_\w+', ),
                              restrict_xpaths="//div[@class = 'con-box']")),
        # Detail ("xiangqing") pages — parse into items.
        Rule(SgmlLinkExtractor(allow=('loupan/p_\w+/xiangqing/')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract one development's details from its xiangqing page."""
        torrent = Home_item()
        torrent['name'] = response.css("div.resb-name::text").extract()
        torrent['price'] = response.css(
            "ul.x-box span.label-val span::text").extract()
        torrent['where'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li/span[@class = 'label-val']/a/text()"
        ).extract()
        torrent['address'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[5]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['sellor'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[7]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['opentime'] = response.css("span.fq-open span::text").extract()
        torrent['gettime'] = response.css(
            "span.fq-handover span::text").extract()
        torrent['alltime'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[3]/li[8]/span[@class = 'label-val']/text()"
        ).extract()
        return torrent
class MantaSpider(CrawlSpider):
    """Crawl manta.com business listings, optionally seeded by a search term.

    BUGFIX: parse_search_result yielded `Requeset(...)` (typo), raising
    NameError whenever a next-page link was found; corrected to `Request`.
    """

    name = 'manta'
    allowed_domains = ['manta.com']
    rules = (Rule(SgmlLinkExtractor(allow=r'Items/'),
                  callback='parse_item',
                  follow=True),
             Rule(SgmlLinkExtractor(allow=r"/c/[^/]*/[^/]*$"),
                  callback='parse_company_detail',
                  follow=True))

    def __init__(self, term=None, *args, **kwargs):
        """Start from a search for `term`, or the homepage if none is given."""
        super(MantaSpider, self).__init__(*args, **kwargs)
        if term:
            self.start_urls = ['http://www.manta.com/mb?search=%s' % term]
        else:
            self.start_urls = ['http://www.manta.com/']

    def parse_start_url(self, response):
        # The start page is itself a results page.
        return self.parse_company(response)

    def parse_search_result(self, response):
        """Follow the 'next' pagination link, if present."""
        hxs = HtmlXPathSelector(response)
        elems = hxs.select('//a[contains(@class, "nextYes")]/@href').extract()
        if len(elems) >= 1:
            yield Request(elems[0], callback=self.parse_company)

    def parse_company(self, response):
        """Yield one Company item per listing block on a results page."""
        hxs = HtmlXPathSelector(response)
        for h in hxs.select('//div[contains(@class, "pbl")]'):
            c = Company()
            c['name'] = h.select('*/h2[@itemprop="name"]/a/text()').extract()
            c['manta_url'] = h.select(
                '*/h2[@itemprop="name"]/a/@href').extract()
            c['street'] = h.select(
                '*/div[@itemprop="streetAddress"]/text()').extract()
            c['locality'] = h.select(
                '*/div[@itemprop="addressLocality"]/text()').extract()
            c['region'] = h.select(
                '*/div[@itemprop="addressRegion"]/text()').extract()
            c['postal_code'] = h.select(
                '*/div[@itemprop="postalCode"]/text()').extract()
            c['phone'] = h.select(
                '*/div[@itemprop="telephone"]/text()').extract()
            c['website'] = h.select('*/div[@itemprop="url"]/text()').extract()
            yield c

    def parse_company_detail(self, response):
        # Placeholder: detail pages are only logged for now.
        print(response)
def _init_args(self, **kwargs):
    """Initialise start URLs, crawl rules, headers, cookies and URL patterns.

    Reads START_URL from kwargs; relies on a module-level `filter_rules`
    sequence both for the link-extraction rule and for the compiled
    patterns below.
    """
    start_url = kwargs.get('START_URL', '')
    if start_url:
        self.start_urls = [start_url]
    self.rules = (Rule(SgmlLinkExtractor(allow=filter_rules),
                       callback="parse_resp",
                       follow=True,
                       process_links=self.put_links), )
    self.headers = {
        'Host': 'cn.futureelectronics.com',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
        'Referer': 'http://cn.futureelectronics.com/zh/Pages/index.aspx'
    }
    # NOTE(review): 'SelectedCurrency': 'NY' looks unusual for a currency
    # code — verify against the site's actual cookie values.
    self.cookies = {
        'SelectedCurrency': 'NY',
        'SelectedLanguage': 'zh-CN',
    }
    # Product search.
    self.product_url_pattern_0 = re.compile(filter_rules[0], re.IGNORECASE)
    # Decide whether a URL is a product-detail URL.
    self.product_url_pattern_1 = re.compile(filter_rules[1], re.IGNORECASE)
    self.product_url_pattern_2 = re.compile(filter_rules[2], re.IGNORECASE)
    # Extract product_id from a product-detail URL, used as goods_sn.
    self.product_id_pattern_1 = re.compile(r'ProductID=([^&]+)', re.IGNORECASE)
    self.product_id_pattern_2 = re.compile(r'/Pages/(.*)\.aspx', re.IGNORECASE)
    # Number of products per page.
    self.limit_num = 10.0
def test_link_nofollow(self):
    """nofollow is detected even inside a multi-token rel attribute."""
    html = """
    <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
    <a href="about.html">About us</a>
    <a href="http://google.com/something" rel="external nofollow">Something</a>
    """
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual(
        [link for link in lx.extract_links(response)],
        [
            Link(url="http://example.org/page.html?action=print",
                 text=u"Printer-friendly page",
                 nofollow=True),
            Link(url="http://example.org/about.html",
                 text=u"About us",
                 nofollow=False),
            # rel="external nofollow" still counts as nofollow.
            Link(url="http://google.com/something",
                 text=u"Something",
                 nofollow=True),
        ],
    )
def __init__(self, book_key, ct, *args, **kwargs):
    """Build per-book crawl rules for m.88dushu.com.

    book_key: the site's book id, interpolated into /mulu/ and /book/ URLs.
    ct: stored but unused here — presumably consumed elsewhere; verify caller.
    Relies on self.nextpage (a class attribute holding the 'next page'
    anchor text) being defined before the rules are built.
    """
    self.book_key = book_key
    self.ct = ct
    self.start_urls = ["http://m.88dushu.com/mulu/" + book_key + "-1/"]
    self.rules = (
        # Table-of-contents pagination: follow 'next page' links only.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/mulu/' + book_key + '-\d+/', ),
            restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
             follow=True),
        # Chapter pages: parse content, do not follow further.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/book/' + book_key + '-\d+/', ),
            restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
             callback='parse_content',
             follow=False),
    )
    super(ChapterSpider, self).__init__(*args, **kwargs)
class MySpider(CrawlSpider):
    """Scrape event entries from stimdi.se's timeline page.

    NOTE(review): using "parse" as a CrawlSpider rule callback overrides the
    method CrawlSpider itself uses to apply its rules — the Scrapy docs warn
    this can break link following; confirm it is intended.
    """

    name = "stimdi"
    allowed_domains = ["stimdi.se"]
    start_urls = ["http://www.stimdi.se/tidslinjen/"]
    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="content"]/div/div/h2/a')),
                  callback="parse",
                  follow=True), )

    def parse(self, response):
        """Yield one AfeventItem per event div, normalising the Swedish date.

        Uses a manual counter `i` to index parallel page-wide extract()
        lists — assumes every XPath yields exactly one entry per event div;
        verify against the live markup.
        """
        i = 0
        print i
        for div in response.xpath('//*[@id="content"]/div/div'):
            print "IN FOR"
            item = AfeventItem()
            # Store data into lists
            item['title'] = div.xpath('//h2/a/text()').extract()[i]
            item['url'] = div.xpath('//h2/a/@href').extract()[i]
            item['location'] = ''
            item['description'] = div.xpath(
                '//*[@id="content"]/div/div[1]/a[1]/p/text()').extract()[i]
            # The following code changes the format of the date
            origDate = div.xpath('//p/text()').extract()[i]
            newDate = ''.join(origDate).replace(',', '').split()
            # Map the Swedish month name to its number (its index in this list).
            month = [
                "", "januari", "februari", "mars", "april", "maj", "juni",
                "juli", "augusti", "september", "oktober", "november",
                "december"
            ].index(newDate[1])
            # Prefix a "0" if the month number is < 10.
            if month < 10:
                zeroMonth = [0, month]
                zeroMonth = ''.join(map(str, zeroMonth))
            else:
                zeroMonth = month
            # Same thing as above with the day.
            if int(newDate[0]) < 10:
                zeroDate = [0, newDate[0]]
                zeroDate = ''.join(map(str, zeroDate))
            else:
                zeroDate = newDate[0]
            # Assemble YYYY-MM-DD and store it in item['date'].
            finalDate = [newDate[2], zeroMonth, zeroDate]
            item['date'] = '-'.join(finalDate)
            print i
            if i < len(response.xpath('//*[@id="content"]/div/div')):
                print "I IF"
                print len(response.xpath('//*[@id="content"]/div/div'))
                i = i + 1
            yield item
class etaoSpider(CrawlSpider):
    """Crawl gouwu.sogou.com search results, rendering pages with Selenium.

    BUGFIX: the domain filter attribute was misspelled `allow_domain`;
    Scrapy's offsite middleware only reads `allowed_domains`, so the typo'd
    attribute had no effect.
    """

    # name of spiders
    name = 'Spider'
    allowed_domains = ['gouwu.sogou.com']
    # One search-result start URL per configured search word.
    start_urls = [('http://gouwu.sogou.com/shop?query=' + searchWord)
                  for searchWord in lstData().lst]
    link_extractor = {
        'page': SgmlLinkExtractor(allow='/detail/\d+\.html.+'),
        'page_down': SgmlLinkExtractor(allow='/shop\?query=.+', ),
        #restrict_xpaths = '//a[@class = "pagination-next"]'
    }
    _x_query = {
        'title': '//p[@class="title"]/a/@title',
        'name': '//span[@class="floatR hui61 mt1"]/text()',
        #//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text()
        'price': '//span[@class="shopprice font17"]/text()',
        # 'price' : '//span[@class = "price"]/text()',
    }

    def __init__(self):
        CrawlSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        """Follow result pagination, render the page, and load item fields."""
        # crawl all display pages
        for link in self.link_extractor['page_down'].extract_links(response):
            yield Request(url=link.url, callback=self.parse)
        # start browser
        self.browser.get(response.url)
        # loading time interval
        time.sleep(5)
        # get the data and write it to scrapy items
        etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
        url = str(response.url)
        etaoItem_loader.add_value('url', url)
        etaoItem_loader.add_xpath('title', self._x_query['title'])
        etaoItem_loader.add_xpath('name', self._x_query['name'])
        etaoItem_loader.add_xpath('price', self._x_query['price'])
        yield etaoItem_loader.load_item()
class PiaohuaCrawlSpider(CrawlSpider):
    """Crawl piaohua.com movie list pages (aiqing / kehuan sections)."""

    name = "PiaohuaCrawlSpider"
    allowed_domains = ['piaohua.com']
    start_urls = [
        'http://piaohua.com/html/aiqing/index.html',
        'http://piaohua.com/html/kehuan/index.html',
    ]
    rules = [
        # Pagination ('list_' URLs) within each section's page bar.
        Rule(SgmlLinkExtractor(allow=('list_'),
                               restrict_xpaths=("//div[@class='page']/a")),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        """Build one PiaohuaItem per movie entry on a list page."""
        # print "parse_item>>>>>>"
        items = []
        sel = Selector(response)
        movie_list = sel.xpath("//div[@id='nml']//dl")
        for movie in movie_list:
            item = PiaohuaItem()
            item['linkurl'] = self.getLinkUrl(movie)
            item['name'] = self.getName(movie)
            item['imageurl'] = self.getImageUrl(movie)
            item['type'] = self.getType(response)
            movieDetail = self.getMovieDetail(item['linkurl'])
            # item['downloadlink'] = self.getDownloadLink(movieDetail)
            # item['updatetime'] = self.getUpdateTime(movieDetail)
            items.append(item)
        return items

    def getLinkUrl(self, site):
        # Relative detail-page URL from the entry's title anchor.
        return site.xpath("dt/a/@href").extract()[0]

    def getImageUrl(self, site):
        return site.xpath("dt//img/@src").extract()[0]

    def getName(self, site):
        return site.xpath("dd/strong/a/b/font/text()").extract()[0]

    def getType(self, response):
        # Section name taken from the URL path (e.g. 'aiqing', 'kehuan').
        return response.url.split('/')[-2]

    def getUpdateTime(self, site):
        str = site.xpath(
            "//div[@id='show']/div[@id='showdesc']/text()").extract()[0]
        return re.search(r'.*(\d{4}-\d{2}-\d{2}).*', str).group(1)

    def getDownloadLink(self, site):
        return site.xpath("//anchor/a/text()").extract()

    def getMovieDetail(self, url):
        # NOTE(review): Selector(Request(...)) does not fetch anything — a
        # Request is not a Response, so this cannot yield detail-page HTML.
        # Both call sites that would consume the result are commented out
        # in parse_item above; confirm before re-enabling them.
        url = 'http://piaohua.com' + url
        return Selector(Request(url=url))
def test_restrict_xpaths_with_html_entities(self):
    """Non-ASCII href characters are emitted percent-encoded (as UTF-8)."""
    html = '<html><body><p><a href="/♥/you?c=€">text</a></p></body></html>'
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=html,
                            encoding='iso8859-15')
    links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
    self.assertEqual(links, [
        Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
             text=u'text')
    ])
class bbcSpider(CrawlSpider):
    """Crawl www.bbc.com article pages, collecting title, body text, and URL."""

    name = "bbc"
    start_urls = ["http://www.bbc.com"]
    download_delay = 2
    handle_httpstatus_list = [301]
    rules = [
        Rule(
            SgmlLinkExtractor(allow=(r"http://www.bbc.com/.*?"),
                              allow_domains=("www.bbc.com")),
            callback='parse_item',
            # BUGFIX: was follow='true' (a string); use the boolean so intent
            # is explicit rather than relying on string truthiness.
            follow=True)
    ]

    def parse_item(self, response):
        """Extract the headline and paragraph/heading text of one article."""
        item = BbcItem()
        title_tmp = response.xpath(
            '//*[@id="page"]//h1//text()').extract_first()
        title = title_tmp
        if title:
            title = title.encode('utf8')
        item['title'] = title
        content_tmp = response.xpath(
            '//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//p//text() | //*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//h2//text()'
        ).extract()
        content = ''
        for con in content_tmp:
            # Guard against empty strings before indexing con[-1]; append a
            # space after sentence-ending fragments so text doesn't run together.
            if con and con[-1] == '.':
                con = con + ' '
            content = content + con.encode('utf-8')
        item['content'] = content
        link = str(response.url)
        item['url'] = link.encode('utf-8')
        return item
class MySpider(CrawlSpider):
    """Scrape event listings from afconsult.com's calendar.

    Follows calendar entries, then requests each event's detail page to fill
    in the description, passing the partially-built item via request.meta.
    """

    name = "af"
    allowed_domains = ["afconsult.com"]
    start_urls = [
        "http://www.afconsult.com/sv/jobba-hos-oss/event-seminarier--massor/"
    ]
    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="CalendarContainer"]/div')),
                  callback="parser",
                  follow=True), )

    def parser(self, response):
        """Yield one detail-page Request per calendar entry.

        Uses a manual counter `i` to index page-wide extract() lists —
        assumes each XPath returns one value per calendar anchor; verify
        against the live markup.
        """
        i = 0
        for div in response.xpath('//*[@id="CalendarContainer"]/div/div/a'):
            item = AfeventItem()
            print "response.xpath"
            item['title'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/h1/text()'
            ).extract()[i]
            item['venue'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/text()'
            ).extract()[i]
            item['date'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/span/text()'
            ).extract()[i]
            item['time'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/span[2]/span/text()|//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[3]/span/text()[3]'
            ).extract()[i]
            item['url'] = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url_1 = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            # The detail page carries the description; carry the item along.
            follow_url = 'http://www.afconsult.com' + follow_url_1
            request = Request(follow_url, callback=self.parse_url)
            request.meta['item'] = item
            if i < len(
                    response.xpath('//*[@id="CalendarContainer"]/div/div/a')):
                i = i + 1
                print i
            yield request

    def parse_url(self, response):
        """Fill in the description from the detail page and yield the item."""
        item = response.meta['item']
        item['description'] = ''.join(
            response.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article//text()'
            ).extract())
        print "parse_url"
        yield item
class HwzSpider(CrawlSpider):
    """Scrape forum posts from HardwareZone's current-affairs lounge."""

    name = "hwz"
    allowed_domains = ["hardwarezone.com.sg"]
    start_urls = [
        "http://forums.hardwarezone.com.sg/current-affairs-lounge-17/"
    ]
    rules = (
        # Extract links matching 'garage-sales-18/.*html' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(SgmlLinkExtractor(allow=('garage\-sales\-18/.*\.html', )), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(
            allow=('current\-affairs\-lounge\-17/.*\.html', )),
             callback='parse_item',
             follow=True),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        #Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def insert_posts(self, posts):
        # NOTE(review): this ignores `posts` and returns an advice string —
        # it reads like a placeholder for a real DB insert; confirm before use.
        return """
        When writing crawl spider rules, avoid using parse as callback, since
        the CrawlSpider uses the parse method itself to implement its logic.
        So if you override the parse method, the crawl spider will no longer work.
        """

    def parse_item(self, response):
        """Collect author/url/body/title/date for every post on a thread page."""
        hxs = HtmlXPathSelector(response)
        posts = hxs.select("//div[@id='posts']/div[@class='post-wrapper']")
        items = []
        for post in posts:
            item = {}
            item['author_id'] = ''.join(
                post.select(".//a[@class='bigusername']/text()").extract())
            item['url'] = response.url
            item['body'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(".//td[@class='alt1']/div/text()").extract()))
            item['title'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(
                        "//h2[@class='header-gray']/text()").extract()))
            item['date_posted'] = ''.join(
                map(lambda x: x.strip(' \t\n\r#').strip(),
                    post.select(".//td[@class='thead']/text()").extract())
            )  # todo: deal with Today and Yesterday
            # item['date_posted'] = normalizeFriendlyDate(' '.join(map(lambda x:x.strip(' \t\n\r'),post.select(".//td[@class='thead']/text()").extract()))) # todo: deal with Today and Yesterday
            items.append(item)
        # self.insert_posts(items)
        print(items)
        return items
def __init__(self, process_idx, book_class, *args, **kwargs):
    """Build crawl rules for one worker's slice of an 88dushu.com category.

    process_idx: worker index as a string; slice N starts at list page N0 and
    follows pages N1..N9, while slice 0 starts at page 1 and follows 1..9.
    book_class: category id used in /wapsort/ URLs.
    Relies on class attributes self.nextpage2 / self.startRead /
    self.nextpage holding the exact link text of the pagination anchors.
    """
    self.idx = int(process_idx)
    self.book_class = book_class
    if self.idx > 0:
        self.start_urls = [
            "http://m.88dushu.com/wapsort/" + book_class + "-" + process_idx +
            "0/"
        ]
        allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-' + process_idx + r'[1-9]/'
    else:
        self.start_urls = [
            "http://m.88dushu.com/wapsort/" + book_class + "-1/"
        ]
        allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-[1-9]/'
    self.rules = (
        # Category list pagination — follow the 'next page' anchor only.
        Rule(
            SgmlLinkExtractor(allow=(allow_url, ),
                              restrict_xpaths=('//a[text()="%s"]' %
                                               (self.nextpage2)))),
        # Book info pages.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/info/\d+/', ),
            restrict_xpaths=('//div[@class="block_img"]')),
             callback='parse_book',
             follow=False),
        # 'Start reading' link into the table of contents.
        Rule(SgmlLinkExtractor(allow=(r'http://m.88dushu.com/mulu/\d+/', ),
                               restrict_xpaths=('//a[text()="%s"]' %
                                                (self.startRead))),
             follow=True),
        # Table-of-contents pagination.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/mulu/\d+-\d+/', ),
            restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
             follow=True),
        # Chapter pages — parse content, do not follow further.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/book/\d+-\d+/', ),
            restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
             callback='parse_content',
             follow=False),
    )
    super(ListSpider, self).__init__(*args, **kwargs)
class QQNewsSpider(CrawlSpider):
    """Crawl news.cnblogs.com and scrape article title/author/date."""

    # Spider name.
    name = "tutorial"
    # Download delay (seconds).
    download_delay = 1
    # Allowed domains.
    allowed_domains = ["news.cnblogs.com"]
    # Start URL.
    start_urls = ["https://news.cnblogs.com"]
    # Crawl rules; a rule without a callback just follows matching URLs recursively.
    rules = [
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_item'),
    ]

    # Content-parsing callback.
    def parse_item(self, response):
        """Extract title / author / release date from one article page.

        NOTE(review): .decode('utf-8') on the unicode returned by extract()
        is a Python-2 idiom that breaks on non-ASCII content — verify.
        """
        print('***********************')
        item = QqnewsItem()
        # Current article.
        title = response.selector.xpath(
            '//*[@id="news_title"]/a')[0].extract().decode('utf-8')
        item['title'] = title
        print(title)
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        release_date = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['release_date'] = release_date
        yield item
class CnblogsSpider(CrawlSpider):
    """Crawl news.cnblogs.com list pages and scrape article summaries.

    BUGFIX: the item callback was named `parse`, overriding the method
    CrawlSpider itself uses to apply its rules — the Scrapy docs explicitly
    warn a CrawlSpider must not override parse. Renamed to `parse_item`
    (and updated the rule callback) so rule-based link following works.
    """

    # Unique spider name used to launch it.
    name = 'cnblogs'
    # Download delay (seconds).
    download_delay = 2
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']
    # Crawl rules; a rule without a callback just follows matching URLs.
    rules = (
        # Pagination pages: follow links only, extract nothing.
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/page/\d', ))
             ),
        # Article pages: extract content.
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_item'))

    # Content-parsing callback.
    def parse_item(self, response):
        """Yield one item per entry block on the page.

        NOTE(review): .decode('utf-8') on unicode is a Python-2 idiom that
        breaks on non-ASCII — verify before porting to Python 3.
        """
        for resp in response.selector.xpath('//div[@class="content"]'):
            item = ScrapyspiderItem()
            title = resp.xpath('h2/a/text()').extract()
            item['title'] = title[0].decode('utf-8')
            url = resp.xpath('h2/a/@href').extract()
            item['url'] = 'https://news.cnblogs.com' + url[0].decode('utf-8')
            author = resp.xpath(
                'div[@class="entry_footer"]/a/text()').extract()
            item['author'] = author[0].strip().decode('utf-8')
            date = resp.xpath(
                'div[@class="entry_footer"]/span[@class="gray"]/text()'
            ).extract()
            item['date'] = date[0].decode('utf-8')
            yield item
class RecursiveScraperSpider(CrawlSpider):
    """Recursively scrape pages under www.cse.iitd.ernet.in/~naveen.

    BUGFIX: the allow pattern read 'cse\.iltd\.ernet\.in' ('iltd' typo), so
    it could never match the actual 'iitd' host given in allowed_domains /
    start_urls and no links were followed.
    """

    name = "rs"
    allowed_domains = ["cse.iitd.ernet.in"]
    start_urls = ["http://www.cse.iitd.ernet.in/~naveen"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("cse\.iitd\.ernet\.in/\~naveen/.*", )),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        """Store the page URL and a fixed table cell's text.

        NOTE(review): the XPath includes /tbody/, which browsers insert but
        raw HTML often lacks — verify it matches the markup as served.
        """
        sel = Selector(response)
        item = RecursivescraperItem()
        item['URL'] = response.request.url
        item['content'] = sel.xpath(
            '/html/body/table/tbody/tr[3]/td[1]/text()[1]').extract()
        return item
class CSDNBlogCrawlSpider(CrawlSpider):
    """Walk a CSDN blog via its 'next article' links, scraping name and URL."""

    name = "CSDNBlogCrawlSpider"
    allowed_domains = ['blog.csdn.net']
    start_urls = ['http://blog.csdn.net/u012150179/article/details/11749017']
    rules = [
        # Follow the chain of posts through the 'next_article' element only.
        Rule(SgmlLinkExtractor(
            allow=('/u012150179/article/details'),
            restrict_xpaths=('//li[@class="next_article"]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        """Extract the post title(s) and URL into a plain dict item."""
        # print "parse_item>>>>>>"
        item = {}
        sel = Selector(response)
        blog_url = str(response.url)
        blog_name = sel.xpath(
            '//div[@id="article_details"]/div/h1/span/a/text()').extract()
        item['blog_name'] = [n.encode('utf-8') for n in blog_name]
        item['blog_url'] = blog_url.encode('utf-8')
        yield item

    # NOTE(review): the helpers below are not called anywhere in this class —
    # presumably kept for reuse elsewhere; verify before removing.
    def getImageUrl(self, item):
        imageurl = item.xpath("a[@class='img']/img/@src").extract()
        if imageurl:
            return imageurl[0]
        else:
            return ''

    def getLink(self, item):
        link = item.xpath("a[@class='img']/@href").extract()
        if link:
            return link[0]
        else:
            return ''

    def getUpdateTime(self, item):
        updatetime = item.xpath("span/text()").extract()
        if updatetime:
            return updatetime[0]
        else:
            return item.xpath("span/font/text()").extract()[0]

    def getName(self, item):
        name = item.xpath("a/strong/font/font/text()").extract()
        if name:
            return name[0]
        else:
            return item.xpath("a/strong/font/text()").extract()[0]
class ZhilianSpider(scrapy.Spider):
    """Scrape job title and company name from zhaopin.com job pages.

    NOTE(review): `rules` only takes effect on CrawlSpider subclasses — on a
    plain scrapy.Spider it is ignored; confirm the intended base class.
    """

    name = "zhilian"
    # allowed_domains =
    start_urls = ["http://jobs.zhaopin.com/bj2140003/"]
    rules = (Rule(
        SgmlLinkExtractor(allow=(r'http://jobs.zhaopin.com/[0-9]+.htm', )),
        callback='parse_page',
        follow=True), )

    def parse_page(self, response):
        """Extract the posting's title and company name.

        BUGFIX: the original stored raw SelectorList objects in the item;
        .extract() is required to obtain the text values.
        """
        sel = Selector(response)
        item = ZhiLianItems()
        item['name'] = sel.xpath(
            '/html/body/div[5]/div[1]/div[1]/h1/text()').extract()
        item['company'] = sel.xpath(
            '/html/body/div[5]/div[1]/div[1]/h2/a/text()').extract()
        return item
class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia article pages (any /wiki/ URL containing no colon)."""

    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        """Grab the page's first <h1> text as the article title."""
        heading = response.xpath('//h1/text()')[0].extract()
        print("Title is: " + heading)
        article = Article()
        article['title'] = heading
        return article
class NoticiasSpider(CrawlSpider):
    """Scrape sports news articles from 20minutos.es."""

    name = 'NoticiasSpider'
    allowed_domains = ['20minutos.es']
    start_urls = ['http://www.20minutos.es/']
    rules = (
        # Sports article URLs only; do not follow further links from them.
        Rule(SgmlLinkExtractor(allow=(r'deportes/noticia/(\w|\d|-|/)*/', )),
             callback='parse_news',
             follow=False),
    )

    def parse_news(self, response):
        """Extract title, date, and URL of one article into a Noticia item."""
        hxs = HtmlXPathSelector(response)
        elemento = Noticia()
        elemento['titulo'] = hxs.select(
            '//h1[contains(@class, "article-title")]/text()')[0].extract()
        elemento['titulo'] = elemento['titulo'].encode('utf-8')
        # Date taken from the 'Noticias del ...' anchor's text.
        elemento['fecha'] = hxs.select(
            '//a[contains(@title, "Noticias del ")]/text()')[0].extract()
        elemento['fecha'] = elemento['fecha'].encode('utf-8')
        elemento['enlace'] = response.url
        return elemento
class StackSpider(CrawlSpider):
    """Crawl Stack Overflow's newest-questions listing pages.

    NOTE(review): using "parse" as a CrawlSpider rule callback overrides the
    method CrawlSpider uses internally to apply its rules — per the Scrapy
    docs this can stop link following; confirm it is intended.
    """

    name = "stackcrawl"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?sort=newest",
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('&page=\d')),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        """Yield one StackItem (title, url) per question summary on the page."""
        hxs = HtmlXPathSelector(response)
        questions = hxs.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
def parse(self, response):
    """Follow every Douban movie-subject link found on the page.

    Extracts matching links with an SgmlLinkExtractor and yields a Request
    (with this spider's headers) for each, parsed by self.parse_item.
    """
    extractor = SgmlLinkExtractor(
        allow=(r'https://movie.douban.com/subject/\d+'))
    for link in extractor.extract_links(response):
        yield Request(link.url, callback=self.parse_item,
                      headers=self.headers)