Example #1
    def parse(self, response):
        # Extract the href strings (not Selector objects) for each book link
        book_urls = response.xpath('//div[@class="info"]/a/@href').extract()
        for url in book_urls:
            book_url_item = MasterItem()
            book_url_item['book_url'] = url
            print('book_url: ', url)
            print('===' * 20)
            yield book_url_item
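
All of the examples on this page populate a MasterItem. As a rough sketch, an items.py definition along the following lines would satisfy the fields used here; the url and book_url fields are taken from the examples, and the original project's definition may declare more:

    import scrapy

    class MasterItem(scrapy.Item):
        # Fields assumed from the examples on this page;
        # the real items.py may declare additional fields
        url = scrapy.Field()
        book_url = scrapy.Field()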
Example #2
    def parse(self, response):
        '''Parse the book detail-page URLs on each listing page.'''
        print(response.url)
        lists = response.css(
            '#subject_list ul li.subject-item a.nbg::attr(href)').extract()
        if lists:
            for i in lists:
                item = MasterItem()
                item['url'] = i
                yield item
Example #3
    def parse(self, response):
        courses_list = response.selector.css('div.course_dl_list')
        for course in courses_list:
            # Create a fresh item per course; reusing one instance would
            # overwrite its fields before earlier items are processed
            item = MasterItem()
            item['url'] = course.css('a::attr(href)').extract_first()
            yield item

        next_url = response.css('a.btn-next::attr(href)').extract_first()
        print('Next page URL:\n\t\t %s' % next_url)
        if next_url and next_url != response.url:
            url = response.urljoin(next_url)
            print(url, response.url)
            yield scrapy.Request(url=url, callback=self.parse)
Example #4
    def parse(self, response):
        # Parse the book detail-page URLs on each listing page
        print(response.url)
        items = response.css('#subject_list .subject-item .info a::attr(href)').extract()
        if items:
            for i in items:
                item = MasterItem()
                item['url'] = i
                yield item

        # Get the URL of the next page (renamed to avoid shadowing the built-in next)
        next_url = response.css('.next a::attr(href)').extract_first()
        if next_url:
            url = response.urljoin(next_url)
            yield scrapy.Request(url=url, callback=self.parse)
Example #5
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import MasterItem  # adjust to your project's items module


class MySpider(CrawlSpider):

    name = 'master'
    allowed_domains = ['58.com']
    start_urls = ['http://cd.58.com/ershoufang/']
    rules = (Rule(LinkExtractor(
        allow=(r'http://cd\.58\.com/ershoufang/\d{14}x\.shtml.*', )),
                  callback='parse_item',
                  follow=True), )

    def parse_item(self, response):
        # Create a fresh item per response; a shared class-level
        # instance would be overwritten by concurrent callbacks
        item = MasterItem()
        item['url'] = response.url
        return item
Example #6
    def parse(self, response):
        '''Parse the book detail-page URLs on each listing page.'''
        print(response.url)
        lists = response.css(
            '#subject_list ul li.subject-item a.nbg::attr(href)').extract()
        if lists:
            for i in lists:
                item = MasterItem()
                item['url'] = i
                yield item
        # Get the URL of the next page
        next_url = response.css("span.next a::attr(href)").extract_first()
        # If this is not the last page, follow it
        if next_url:
            url = response.urljoin(next_url)
            # Schedule crawling of the next listing page
            yield scrapy.Request(url=url, callback=self.parse)
Example #7
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import MasterItem  # adjust to your project's items module


class MySpider(CrawlSpider):

    name = 'master'
    # allowed_domains = ['58.com']
    start_urls = ['https://movie.douban.com/top250?']
    rules = (
        # e.g. https://movie.douban.com/top250?start=225&filter=
        Rule(LinkExtractor(allow=(r'https://movie\.douban\.com/top250\?.*',)),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        # Create a fresh item per response; a shared class-level
        # instance would be overwritten by concurrent callbacks
        item = MasterItem()
        print(item)
        item['url'] = response.url
        return item

    def parse_url(self, response):
        item = MasterItem()
        item["url"] = response.url
        yield item
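
To try one of these spiders, place it in a Scrapy project and run it by its name attribute. A typical invocation, with the spider name 'master' taken from the examples and an illustrative output file name:

    # Run the spider and write scraped items as JSON lines
    scrapy crawl master -o items.jl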