def parse(self, response):
    """Extract every book detail-page URL from the listing page and yield one item per URL."""
    # .extract() is required: without it the xpath call yields Selector
    # objects, and the item would carry a Selector instead of a URL string.
    book_urls = response.xpath('//div[@class="info"]/a/@href').extract()
    for url in book_urls:
        book_url_item = MasterItem()
        book_url_item['book_url'] = url
        print('book_url: ', url)
        print('===' * 20)
        yield book_url_item
def parse(self, response):
    """Parse the detail-page URL of every book on this listing page."""
    print(response.url)
    css_path = '#subject_list ul li.subject-item a.nbg::attr(href)'
    hrefs = response.css(css_path).extract()
    # Yield one item per extracted href (no-op when the page has none).
    for href in hrefs:
        item = MasterItem()
        item['url'] = href
        yield item
def parse(self, response):
    """Yield one item per course link on the page, then follow pagination."""
    courses_list = response.selector.css('div.course_dl_list')
    for course in courses_list:
        # Create a fresh item per course. The original reused one item
        # instance across every yield, so later iterations overwrote the
        # URL of items already handed to the pipeline.
        item = MasterItem()
        item['url'] = course.css('a::attr(href)').extract_first()
        yield item
    next_url = response.css('a.btn-next::attr(href)').extract_first()
    print('下一页URL:\n\t\t %s' % next_url)
    # Truthiness guard: extract_first() returns None on the last page, and
    # response.urljoin(None) would raise.
    if next_url and next_url != response.url:
        url = response.urljoin(next_url)
        print(url, response.url)
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    """Parse every book's detail-page URL on this page, then follow pagination."""
    print(response.url)
    items = response.css('#subject_list .subject-item .info a::attr(href)').extract()
    if items:
        for i in items:
            item = MasterItem()
            item['url'] = i
            yield item
    # Renamed from `next`, which shadowed the builtin next().
    next_url = response.css('.next a::attr(href)').extract_first()
    # Absent on the last page (extract_first() returns None).
    if next_url:
        url = response.urljoin(next_url)
        yield scrapy.Request(url=url, callback=self.parse)
class myspider(CrawlSpider):
    """Crawl 58.com Chengdu second-hand-housing listings and yield each detail-page URL."""
    name = 'master'
    allowed_domains = ['58.com']
    start_urls = ['http://cd.58.com/ershoufang/']
    rules = (
        # Raw string so \d is a regex digit class rather than an invalid
        # (deprecated) string escape.
        Rule(LinkExtractor(allow=(r'http://cd.58.com/ershoufang/\d{14}x.shtml.*?',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Build a fresh item per response. The original mutated a single
        # class-level item shared by every callback, so concurrently
        # processed responses clobbered each other's 'url' field.
        item = MasterItem()
        item['url'] = response.url
        return item
def parse(self, response):
    """Parse the detail-page URL of every book on this page, then follow pagination."""
    print(response.url)
    css_path = '#subject_list ul li.subject-item a.nbg::attr(href)'
    # One item per extracted href (no-op on an empty page).
    for href in response.css(css_path).extract():
        item = MasterItem()
        item['url'] = href
        yield item
    # Follow the "next page" link when present (absent on the last page).
    next_href = response.css("span.next a::attr(href)").extract_first()
    if next_href:
        yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)
class myspider(CrawlSpider):
    """Crawl Douban movie Top 250 listing pages and yield each page URL."""
    name = 'master'
    # allowed_domains = ['58.com']
    start_urls = ['https://movie.douban.com/top250?']
    rules = (
        # e.g. https://movie.douban.com/top250?start=225&filter=
        # The '?' must be escaped: unescaped it is the regex "optional"
        # quantifier (making the preceding '0' optional) instead of the
        # literal query-string delimiter.
        Rule(LinkExtractor(allow=(r'https://movie\.douban\.com/top250\?.*?',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Build a fresh item per response. The original mutated a single
        # class-level item shared by every callback, so concurrently
        # processed responses clobbered each other's 'url' field.
        item = MasterItem()
        print(item)
        item['url'] = response.url
        return item
def parse_url(self, response):
    """Wrap the response's URL in an item and yield it."""
    url_item = MasterItem()
    url_item["url"] = response.url
    yield url_item