def parse_second_cate(self, res: Response):
    """Collect sub-category links from a category page and follow each one.

    The page is first checked for the captcha marker; if present, the same
    URL is requested again and nothing else is parsed.  Otherwise:

    * if the "see all results" text is present, the ``href`` of the element
      two levels above that text node is stored in ``Amazon.CATE_LINK_SET``;
    * every navigation anchor inside the ``departments`` ``div`` has its link
      stored in ``Amazon.CATE_LINK_SET`` and, when non-empty, its text stored
      in ``Amazon.KEYWORD_SEARCH_SET``; the link is then followed.

    Args:
        res: Response for the category page.  ``res.html_tree()`` is assumed
            to return a BeautifulSoup-style tree (``find`` / ``find_all``)
            -- TODO confirm against the framework.

    Yields:
        Whatever ``res.follow(...)`` returns (presumably a request object)
        for the captcha retry or for each sub-category link.
    """
    if "captchacharacters" in res.text:
        print(f"出现验证码:{res.url}")
        # This function is a generator, so a ``return <value>`` would be
        # swallowed into StopIteration and the retry would never be scheduled.
        # NOTE(review): if the framework has no duplicate filter or retry
        # limit, a persistent captcha will keep re-requesting this URL.
        yield res.follow(res.url)
        return

    bs = res.html_tree()

    see_all_text = "查看所有的结果"
    if see_all_text in res.text:
        # The text can occur in the raw HTML without being a standalone text
        # node, so any step of the lookup chain may yield None.
        try:
            see_all_href = bs.find(text=see_all_text).parent.parent.get("href")
        except AttributeError:
            see_all_href = None
        if see_all_href is not None:
            self.redis_client.sadd(
                Amazon.CATE_LINK_SET, self.domain + see_all_href.strip()
            )

    departments = bs.find("div", {"id": "departments"})
    if departments is None:
        # No department sidebar on this page: nothing further to follow.
        return

    anchors = departments.find_all(
        "a", {"class": "a-link-normal s-navigation-item"}
    )
    for cate in anchors:
        cate_name = cate.get_text().strip()
        href = cate.get("href")
        if href is None:
            # An anchor without a target cannot be followed; skip it rather
            # than abort the remaining categories.
            continue
        link = self.domain + href.strip()
        self.redis_client.sadd(Amazon.CATE_LINK_SET, link)
        if cate_name:
            self.redis_client.sadd(Amazon.KEYWORD_SEARCH_SET, cate_name)
        print(f"{cate_name}:{link}")
        yield res.follow(link)
async def parse(self, response: Response):
    """Extract top-level department links and follow each one.

    Every anchor with class ``a-link-normal fsdLink fsdDeptLink`` is visited
    in reverse document order.  For each one the absolute link is added to
    ``Amazon.CATE_LINK_SET``, the anchor text is added to
    ``Amazon.KEYWORD_SEARCH_SET``, the pair is printed, and the link is
    followed with ``parse_second_cate`` as the callback.

    Args:
        response: Response for the department index page.

    Yields:
        The result of ``response.follow(link, callback=self.parse_second_cate)``
        for each department anchor.
    """
    tree = response.html_tree()
    dept_anchors = tree.find_all(
        "a", {"class": "a-link-normal fsdLink fsdDeptLink"}
    )
    for anchor in reversed(dept_anchors):
        name = anchor.get_text().strip()
        url = self.domain + anchor.get("href").strip()

        self.redis_client.sadd(Amazon.CATE_LINK_SET, url)
        self.redis_client.sadd(Amazon.KEYWORD_SEARCH_SET, name)

        print({"cate_name": name, "link": url})
        yield response.follow(url, callback=self.parse_second_cate)
def parse(self, response: Response):
    """Parse a goods listing page, store each item, and follow the next page.

    The page is first checked for the captcha marker; if present, the same
    URL is requested again and nothing else is parsed.  Otherwise every
    ``div.a-section.a-spacing-medium`` card is turned into a dict with the
    keys ``asin_id``, ``img_url``, ``goods_name``, ``star_rating``, ``price``,
    ``goods_detail_link``, ``goods_comment_num`` and ``goods_comment_link``.
    Fields whose element is missing are stored as ``""``.  Each item is
    JSON-encoded into ``Amazon.GOODS_LIST_ITEM_HASH`` under its ASIN, and
    detail links are also added to ``Amazon.GOODS_DEATAIL_LINK_SET``.

    Args:
        response: Response for the listing page.  ``response.html_tree()`` is
            assumed to return a BeautifulSoup-style tree -- TODO confirm.

    Yields:
        Whatever ``response.follow(...)`` returns (presumably a request
        object) for the captcha retry or for the next listing page.
    """
    if "captchacharacters" in response.text:
        print(f"出现验证码:{response.url}")
        # This function is a generator, so a ``return <value>`` would be
        # swallowed into StopIteration and the retry would never be scheduled.
        # NOTE(review): if the framework has no duplicate filter or retry
        # limit, a persistent captcha will keep re-requesting this URL.
        yield response.follow(response.url)
        return

    bs = response.html_tree()
    goods_tags = bs.find_all("div", {"class": "a-section a-spacing-medium"})
    if not goods_tags:
        return

    for tag in goods_tags:
        # The ASIN is read from the element four levels above the card.
        try:
            asin_id = tag.parent.parent.parent.parent.get("data-asin")
        except AttributeError:
            asin_id = None
        if asin_id is None:
            # Without an ASIN there is no hash field to store the item under.
            print(f"skip goods card without data-asin: {response.url}")
            continue

        img = tag.find("img")
        rating_span = tag.find("span", {"class": "a-icon-alt"})
        price_span = tag.find("span", {"class": "a-offscreen"})
        detail_anchor = tag.find("a", {"class": "a-link-normal s-no-outline"})
        comment_span = tag.find("span", {"class": "a-size-base"})

        detail_href = (
            detail_anchor.get("href") if detail_anchor is not None else None
        )
        comment_href = (
            comment_span.parent.get("href") if comment_span is not None else None
        )

        # Key order is kept stable so the stored JSON layout does not change.
        item = {
            "asin_id": asin_id,
            "img_url": img.get("src") if img is not None else "",
            "goods_name": img.get("alt") if img is not None else "",
            "star_rating": rating_span.string if rating_span is not None else "",
            "price": price_span.string if price_span is not None else "",
            "goods_detail_link": (
                self.domain + detail_href if detail_href is not None else ""
            ),
            "goods_comment_num": (
                comment_span.string if comment_span is not None else ""
            ),
            "goods_comment_link": (
                self.domain + comment_href if comment_href is not None else ""
            ),
        }

        if item["goods_detail_link"]:
            self.redis_client.sadd(
                Amazon.GOODS_DEATAIL_LINK_SET, item["goods_detail_link"]
            )

        print(item)
        self.redis_client.hset(
            Amazon.GOODS_LIST_ITEM_HASH,
            item["asin_id"],
            json.dumps(item, ensure_ascii=True),
        )

    # Pagination: the "next" link is the anchor inside ``li.a-last``; it is
    # absent (or has no href) on the final page.
    last_li = bs.find("li", {"class": "a-last"})
    next_anchor = last_li.a if last_li is not None else None
    next_href = next_anchor.get("href") if next_anchor is not None else None
    if next_href is None:
        return

    next_page_url = self.domain + next_href
    if next_page_url:
        yield response.follow(next_page_url)