Example #1
0
    def parse(self, response):
        """Parse one listing page of job postings.

        Builds a ``PostItem`` for every posting row that matches
        ``self.keys``, yields the matching items to the Scrapy pipeline,
        and follows the pagination link until the page budget
        (``self.page_count``) is exhausted.

        Args:
            response: the Scrapy ``Response`` for the current listing page.

        Yields:
            ``PostItem`` instances for matching rows, and a
            ``scrapy.Request`` for the next page when one exists.
        """
        # The last ".bk a" link in the pager block points to the next page;
        # an empty selector result means we are on the final page.
        links = Selector(response).css(".p_in ul .bk a::attr(href)").extract()
        next_page = links[-1] if links else None

        # Enforce the page budget before doing any per-row work.
        self.page_count -= 1
        if self.page_count < 0:
            print(
                "warn: page count upper limit exceeded, did not complete all scraping"
            )
            return

        for row in response.css('.dw_table .el:not(.title)').extract():
            item = PostItem.create(row, self.keys)
            if item is None:
                # Row did not match the requested keys: count and report it.
                self.invalid += 1
                print("found", self.invalid, "items that do not match key",
                      self.keys)
            else:
                # Bug fix: valid items were previously created and then
                # discarded — hand them to the Scrapy item pipeline.
                yield item

        self.page_index += 1
        print("finished scraping page", self.page_index)
        if next_page is not None:
            yield scrapy.Request(next_page, self.parse)