Beispiel #1
0
    def parse_list(self, response: scrapy.http.response.Response):
        """Follow the pagination links found on an article-list page.

        Two links are looked up inside the ``J_main-container`` element: the
        home page's "more" button and the pagination bar's "Next" link
        (skipped when its ``<li>`` carries a ``disabled`` class). Each link
        that is present is followed with this same method as the callback.

        Args:
            response: The downloaded list page; it must support ``xpath``
                and ``follow``.

        Yields:
            The result of ``response.follow`` for every link that was found.
        """
        # NOTE(review): this callback only walks pagination. The article
        # links on the page
        # ('//*[@id="J_main-container"]//h2[@class="post-title"]/a')
        # are not scheduled from here -- confirm whether they should be.
        pagination_xpaths = (
            # The "second page" button on the home page.
            '//*[@id="J_main-container"]'
            '//a[contains(@class, "home-browser-more-btn")]/@href',
            # The "Next" arrow of the pagination bar, unless it is disabled.
            '//*[@id="J_main-container"]//ul[@class="pagination"]'
            '/li[not(contains(@class, "disabled"))]/a[@aria-label="Next"]/@href',
        )
        for pagination_xpath in pagination_xpaths:
            href = response.xpath(pagination_xpath).get()
            if href:
                yield response.follow(href, callback=self.parse_list)
    def parse(self, response: scrapy.http.response.Response):
        """Scrape one article page and optionally chain to the next article.

        If the page has a "next" navigation link, the link is printed,
        ``self.count`` is incremented, and the link is followed with this
        method as callback as long as the incremented count is below 20.
        Afterwards the article fields are extracted and emitted as an item.

        Args:
            response: The downloaded article page; it must support
                ``xpath``, ``follow``, ``text`` and ``url``.

        Yields:
            First the follow-up request for the next article (when one is
            scheduled), then a ``MeituanArticleSpiderItem`` for this page.
        """
        next_page = response.xpath(
            '//div[@class="navigation-wrapper"]/div/a[@class="next"]/@href'
        ).get()
        if next_page:
            print(next_page)
            # The counter is bumped for every "next" link seen, even when
            # the link ends up not being followed because the cap is hit.
            self.count += 1
            if self.count < 20:
                yield response.follow(next_page, callback=self.parse)

        tags = response.xpath('//span[@class="tag-links"]/a/text()').getall()
        # The extractor receives the raw page text; its result is indexed by
        # 'title', 'content', 'author' and 'publish_time' below.
        res = self.extractor.extract(response.text)
        yield MeituanArticleSpiderItem(
            url=response.url,
            title=res['title'],
            content=res['content'],
            tags=tags,
            author=res['author'],
            publish_time=res['publish_time'],
        )