Example #1
    def parse(self, response):
        se = Selector(response=response)
        # Each bestseller entry is extracted as a raw HTML string and
        # re-wrapped in a Selector, so the inner XPaths are scoped to
        # a single book.
        books = se.xpath('//div[@class="zg_itemImmersion"]').extract()
        for book in books:
            item = MyspidersItem()
            se2 = Selector(text=book)

            author = se2.xpath(
                '//span[contains(@class, "a-size-small a-color-")]/text()'
            ).extract()[0]
            rank = se2.xpath(
                '//span[@class="zg_rankNumber"]/text()').extract()[0]
            name = se2.xpath('//img/@alt').extract()[0]

            # The price appears under one of two class names depending on
            # the listing, so fall back to the second when the first is empty.
            price = se2.xpath('//span[@class="p13n-sc-price"]/text()')
            if len(price) == 0:
                price = se2.xpath(
                    '//span[@class="a-size-base a-color-price"]/text()'
                ).extract()[0]
            else:
                price = price.extract()[0]

            # Books without reviews have no rating row; default the star
            # rating and the reviewer count to 0.
            stars = se2.xpath(
                '//div[@class="a-icon-row a-spacing-none"]/a/@title'
            ).extract()
            stars = stars[0] if stars else 0

            reviewers = se2.xpath(
                '//div[@class="a-icon-row a-spacing-none"]/a[@class="a-size-small a-link-normal"]/text()'
            ).extract()
            reviewers = reviewers[0] if reviewers else 0

            item['name'] = name
            item['author'] = author
            item['rank'] = rank.strip().replace('.', '')  # "1." -> "1"
            item['price'] = price
            item['stars'] = stars
            item['reviewers'] = reviewers

            yield item

        # Follow the page link right after the currently selected one.
        # Scrapy ignores non-200 responses such as 404 by default, so a
        # blocking requests.get() pre-check is unnecessary and would only
        # stall the event loop.
        next_page = se.xpath(
            '//li[@class="zg_page zg_selected"]/following-sibling::*[1]/a/@href'
        ).extract_first()
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.parse)
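All seven snippets assume the usual Scrapy scaffolding: import scrapy, from scrapy.selector import Selector, and a MyspidersItem imported from the project's items module. The item class itself is never shown; a plausible definition covering Example #1's fields (only the field names come from the code above, the rest is an assumption):

    import scrapy

    class MyspidersItem(scrapy.Item):
        # Fields assigned by the bestseller spider in Example #1; a real
        # project would declare the union of every field the examples use.
        name = scrapy.Field()
        author = scrapy.Field()
        rank = scrapy.Field()
        price = scrapy.Field()
        stars = scrapy.Field()
        reviewers = scrapy.Field()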
Example #2
    def parse(self, response):
        movies = Selector(response=response).xpath(
            '//div[@class="channel-detail movie-item-title"]')
        for movie in movies:
            item = MyspidersItem()
            title = movie.xpath('./a/text()').extract()[0]
            # The href is site-relative, so prefix the Maoyan domain.
            link = f'https://maoyan.com{movie.xpath("./a/@href").extract()[0]}'
            item['title'] = title
            # Pass the half-filled item to the detail-page callback via meta;
            # dont_filter=True keeps Scrapy from deduplicating the request.
            yield scrapy.Request(url=link, meta={'item': item},
                                 callback=self.parse2, dont_filter=True)
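parse2 is referenced but never defined in the snippet. A minimal sketch of such a detail-page callback, recovering the half-filled item from meta (the XPath and the score field are hypothetical):

    def parse2(self, response):
        # Pick the item started in parse() back out of the request meta.
        item = response.meta['item']
        # Hypothetical detail-page extraction; the real selector is not shown.
        item['score'] = response.xpath(
            '//span[@class="score"]/text()').extract_first()
        yield item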
Example #3
    def parse(self, response):
        # Each ranked entry sits in a <div class="hd">.
        all_movies = Selector(response=response).xpath("//div[@class='hd']")
        for movie in all_movies:
            title = movie.xpath('a/span/text()').extract_first().strip()
            url = movie.xpath('a/@href').extract_first().strip()
            item = MyspidersItem()
            item["title"] = title
            item["url"] = url
            yield scrapy.Request(url,
                                 callback=self.parse2,
                                 meta={"item": item})
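The commented-out lines in the original perform the same extraction with BeautifulSoup; as a self-contained equivalent:

    from bs4 import BeautifulSoup

    def extract_titles(html):
        # Same selection as the XPath above: every <div class="hd">, then
        # the title <span> and the link inside its first <a>.
        soup = BeautifulSoup(html, "html.parser")
        for div in soup.find_all("div", attrs={"class": "hd"}):
            yield div.a.find("span").text, div.a.get("href")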
Example #4
    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")
        # An earlier version collected every item in a list and returned
        # it at the end (items = []; ...; return items); yielding each
        # item instead streams it straight to the item pipeline.
        for node in node_list:
            item = MyspidersItem()

            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            # extract() returns a list; take the first (only) match.
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # Hand each extracted item to the pipeline for processing.
            yield item
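The pipeline mentioned in the comment receives every yielded item. A minimal pipelines.py sketch writing JSON lines (the class name and filename are assumptions; it would be enabled through ITEM_PIPELINES in settings.py):

    import json

    class MyspidersPipeline:
        def open_spider(self, spider):
            self.file = open('items.jl', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # Called once per yielded item; write it as one JSON line.
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item

        def close_spider(self, spider):
            self.file.close()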
Example #5
    def getinfor(self, response):
        comments = Selector(
            response=response).xpath('//div[@class="comment_con"]')
        for comment in comments:
            item = MyspidersItem()
            # Skip comments that carry no text; extract_first() returns
            # None when the <span> is missing.
            comment_content = comment.xpath('./p/span/text()').extract_first()
            if comment_content is None:
                continue
            comment_content = comment_content.strip()
            comment_time = comment.xpath(
                './/div[@class="time"]/meta/@content').extract()[0]
            username = comment.xpath(
                './/a[@class="a_underline user_name"]/span/text()'
            ).extract()[0]

            # SnowNLP scores the comment text: sentiments is a float in
            # [0, 1], where values near 1 read as positive.
            s = SnowNLP(comment_content)
            item['sentiments'] = s.sentiments
            item['comments'] = comment_content
            item['username'] = username
            item['comment_time'] = comment_time
            yield item
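SnowNLP's sentiments attribute is a probability in [0, 1] that the (Chinese) text is positive. A quick standalone check of the call used above (the printed values are indicative only):

    from snownlp import SnowNLP

    print(SnowNLP(u'这本书非常好看').sentiments)    # near 1: positive
    print(SnowNLP(u'质量太差,很失望').sentiments)   # near 0: negative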
Example #6
    def parse(self, response):
        # Grab every listing row; the table alternates "even"/"odd" classes.
        node_list = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
        for node in node_list:
            item = MyspidersItem()
            item['profession'] = node.xpath('./td[1]/a/text()').extract()[0]
            item['link'] = 'https://hr.tencent.com/' + node.xpath(
                './td[1]/a/@href').extract()[0]
            # extract_first() defaults to None, so it is safe when a cell
            # may be empty and at most one match is expected.
            item['category'] = node.xpath('./td[2]/text()').extract_first()
            item['num'] = node.xpath('./td[3]/text()').extract()[0]
            item['address'] = node.xpath('./td[4]/text()').extract()[0]
            item['pub_date'] = node.xpath('./td[5]/text()').extract()[0]

            yield scrapy.Request(url=item['link'],
                                 callback=self.parse_detail,
                                 meta={'key': item})

        # Pagination: follow the "next" link until it disappears.
        next_url = response.xpath('//*[@id="next"]/@href').extract_first()
        if next_url is not None:
            next_url = 'https://hr.tencent.com/' + next_url
            yield scrapy.Request(url=next_url, callback=self.parse)
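Examples #2, #3, #5, and #6 all ferry the item between callbacks through meta. On Scrapy 1.7+ the same hand-off can use cb_kwargs, which delivers the item as a named argument; a sketch of Example #6's pattern rewritten that way (the spider name and start URL are assumptions):

    import scrapy

    class JobsSpider(scrapy.Spider):
        name = 'jobs'
        start_urls = ['https://hr.tencent.com/position.php']  # hypothetical

        def parse(self, response):
            for node in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
                item = {'profession': node.xpath('./td[1]/a/text()').get()}
                link = response.urljoin(node.xpath('./td[1]/a/@href').get())
                # cb_kwargs replaces meta={'key': item}: the callback
                # receives the item as a keyword argument.
                yield scrapy.Request(link, callback=self.parse_detail,
                                     cb_kwargs={'item': item})

        def parse_detail(self, response, item):
            item['link'] = response.url
            yield item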
Example #7
    def parse(self, response):
        # The double space inside "top-list  fn-clear" matches the page's
        # class attribute literally.
        movies = response.xpath('//ul[@class="top-list  fn-clear"]/li')
        for each_movie in movies:
            item = MyspidersItem()
            item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]
            yield item
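Each of these parse methods lives inside a scrapy.Spider subclass; assuming the spider's name attribute is myspider (an assumption), the crawl runs and exports the yielded items with:

    scrapy crawl myspider -o items.json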