# Imports assumed by the spider snippets below (these are methods of Scrapy
# spider classes; "myspiders.items" is the usual location for the project's
# Item class, inferred from its name).
import requests
import scrapy
from scrapy.selector import Selector
from snownlp import SnowNLP

from myspiders.items import MyspidersItem


def parse(self, response):
    se = Selector(response=response)
    books = se.xpath('//div[@class="zg_itemImmersion"]').extract()
    for book in books:
        item = MyspidersItem()
        se2 = Selector(text=book)
        author = se2.xpath(
            '//span[contains(@class, "a-size-small a-color-")]/text()'
        ).extract()[0]
        rank = se2.xpath('//span[@class="zg_rankNumber"]/text()').extract()[0]
        name = se2.xpath('//img/@alt').extract()[0]
        price = se2.xpath('//span[@class="p13n-sc-price"]/text()')
        if len(price) == 0:
            price = se2.xpath(
                '//span[@class="a-size-base a-color-price"]/text()'
            ).extract()[0]
        else:
            price = price.extract()[0]
        stars = se2.xpath(
            '//div[@class="a-icon-row a-spacing-none"]/a/@title').extract()
        stars = stars[0] if stars else 0
        reviewers = se2.xpath(
            '//div[@class="a-icon-row a-spacing-none"]'
            '/a[@class="a-size-small a-link-normal"]/text()').extract()
        reviewers = reviewers[0] if reviewers else 0
        item['name'] = name
        item['author'] = author
        item['rank'] = rank.strip().replace('.', '')  # "1." -> "1"
        item['price'] = price
        item['stars'] = stars
        item['reviewers'] = reviewers
        yield item
    # Pagination: the sibling of the selected page number is the next page.
    current_page = se.xpath(
        '//li[@class="zg_page zg_selected"]/following-sibling::*[1]/a/@href'
    ).extract()
    if len(current_page) != 0:
        url = current_page[0]
        # Blocking pre-check: skip the next page if it 404s. See the
        # errback-based alternative sketched below.
        resp = requests.get(url)
        if resp.status_code == 404:
            return
        yield scrapy.Request(url=url, callback=self.parse)
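# The requests.get() pre-check above blocks Scrapy's event loop. A sketch
# of a non-blocking alternative: yield the Request with an errback and let
# Scrapy surface the 404 itself (on_pagination_error is our own name, not
# from the original code):
#
#     yield scrapy.Request(url=url, callback=self.parse,
#                          errback=self.on_pagination_error)

def on_pagination_error(self, failure):
    # With default settings, HttpErrorMiddleware turns non-2xx responses
    # (such as the 404 checked above) into failures that land here.
    self.logger.warning('pagination request failed: %r', failure)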
def parse(self, response):
    movies = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for movie in movies:
        item = MyspidersItem()
        title = movie.xpath('./a/text()').extract()[0]
        link = f'https://maoyan.com{movie.xpath("./a/@href").extract()[0]}'
        item['title'] = title
        # Pass the half-filled item along to the detail-page callback;
        # dont_filter lets repeated detail URLs through the dupe filter.
        yield scrapy.Request(url=link, meta={'item': item},
                             callback=self.parse2, dont_filter=True)
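# parse2 is referenced above but not included in this collection. A minimal
# sketch of what such a detail-page callback could look like; the body is
# illustrative, and only the response.meta['item'] retrieval is the actual
# Scrapy mechanism being demonstrated:
def parse2(self, response):
    # Recover the half-filled item that travelled with the request.
    item = response.meta['item']
    # A real callback would extract detail-page fields into the item here.
    yield item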
def parse(self, response):
    all_movies = Selector(response=response).xpath("//div[@class='hd']")
    for movie in all_movies:
        title = movie.xpath('a/span/text()').extract_first().strip()
        url = movie.xpath('a/@href').extract_first().strip()
        item = MyspidersItem()
        item["title"] = title
        item["url"] = url
        yield scrapy.Request(url, callback=self.parse2, meta={"item": item})
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")
    for node in node_list:
        item = MyspidersItem()
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # Yield each item to the pipeline as it is extracted, instead of
        # collecting everything in a list and returning it at the end.
        yield item
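# MyspidersItem comes from the project's items module. A minimal definition
# covering just the fields this snippet fills (the other snippets appear to
# use their own variants with different fields):
class MyspidersItem(scrapy.Item):
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()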
def getinfor(self, response):
    comments = Selector(
        response=response).xpath('//div[@class="comment_con"]')
    for comment in comments:
        item = MyspidersItem()
        comment_contents = comment.xpath('./p/span/text()')
        comment_time = comment.xpath(
            './/div[@class="time"]/meta/@content').extract()[0]
        username = comment.xpath(
            './/a[@class="a_underline user_name"]/span/text()').extract()[0]
        comment_content = comment_contents.extract_first().strip()
        # Score the comment text with SnowNLP (see the note below).
        s = SnowNLP(comment_content)
        item['sentiments'] = s.sentiments
        item['comments'] = comment_content
        item['username'] = username
        item['comment_time'] = comment_time
        yield item
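# SnowNLP's sentiments property returns a float in [0, 1]: values near 1
# read as positive, near 0 as negative. A standalone check with a sample
# sentence of our own:
from snownlp import SnowNLP

score = SnowNLP('这本书非常好').sentiments  # "this book is very good"
print(score)  # expect a value close to 1.0, i.e. positive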
def parse(self, response):
    # Job rows alternate between "even" and "odd" row classes.
    node_list = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
    for node in node_list:
        item = MyspidersItem()
        item['profession'] = node.xpath('./td[1]/a/text()').extract()[0]
        item['link'] = ('https://hr.tencent.com/' +
                        node.xpath('./td[1]/a/@href').extract()[0])
        # extract_first() returns None instead of raising when nothing
        # matches; use it when at most one value is expected.
        item['category'] = node.xpath('./td[2]/text()').extract_first()
        item['num'] = node.xpath('./td[3]/text()').extract()[0]
        item['address'] = node.xpath('./td[4]/text()').extract()[0]
        item['pub_date'] = node.xpath('./td[5]/text()').extract()[0]
        yield scrapy.Request(url=item['link'], callback=self.parse_detail,
                             meta={'key': item})
    # Pagination: follow the "next" link until it disappears.
    next_url = response.xpath('//*[@id="next"]/@href').extract_first()
    if next_url is not None:
        yield scrapy.Request(url='https://hr.tencent.com/' + next_url,
                             callback=self.parse)
def parse(self, response):
    movies = response.xpath('//ul[@class="top-list fn-clear"]/li')
    for each_movie in movies:
        item = MyspidersItem()
        item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]
        yield item
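# Every parse method above yields items, so each one flows through the
# project's item pipelines as it is scraped. A minimal pipeline sketch
# (class name and behaviour are illustrative; a real project enables it
# via ITEM_PIPELINES in settings.py):
class MyspidersPipeline:
    def process_item(self, item, spider):
        # Clean, validate, or persist the item here; returning it passes
        # it on to the next enabled pipeline.
        spider.logger.info('scraped: %r', dict(item))
        return item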