Ejemplo n.º 1
0
 def parse_second_cate(self, res: Response):
     """Record the sub-category links of a category page and follow them.

     This is a generator: every request that should be scheduled is
     yielded.

     Behaviour, in order:

     1. If the page text contains ``captchacharacters``, the same URL is
        requested again and nothing else is parsed.
     2. If the page text contains the "see all results" marker string, the
        ``href`` of the element two levels above that text node is added
        to the ``Amazon.CATE_LINK_SET`` set.
     3. Every navigation anchor inside the ``div`` with id ``departments``
        has its link added to ``Amazon.CATE_LINK_SET`` and its non-empty
        label added to ``Amazon.KEYWORD_SEARCH_SET``; the link is then
        followed.

     Args:
         res: Response of the category page. Only ``text``, ``url``,
             ``follow()`` and ``html_tree()`` are used.

     Yields:
         The objects returned by ``res.follow(...)``.
     """
     if "captchacharacters" in res.text:
         print(f"出现验证码:{res.url}")
         # A ``return <value>`` inside a generator only sets
         # StopIteration.value and is never seen by whoever iterates the
         # callback, so the retry request has to be yielded.
         yield res.follow(res.url)
         return

     bs = res.html_tree()

     see_all_text = "查看所有的结果"
     if see_all_text in res.text:
         # The marker can occur in the raw page without being an exact text
         # node with a linked grand-parent; treat that as "no link" rather
         # than aborting the whole page.
         try:
             tag = bs.find(text=see_all_text).parent.parent
             url = self.domain + tag.get("href").strip()
         except (AttributeError, TypeError):
             url = None
         if url is not None:
             self.redis_client.sadd(Amazon.CATE_LINK_SET, url)

     departments = bs.find("div", {"id": "departments"})
     if departments is None:
         # Page has no department sidebar: nothing to follow.
         return

     anchors = departments.find_all(
         "a", {"class": "a-link-normal s-navigation-item"})
     for cate in anchors:
         href = cate.get("href")
         if not href:
             # An anchor without a target cannot be followed; skip it
             # instead of failing on ``None.strip()``.
             continue
         name = cate.get_text().strip()
         link = self.domain + href.strip()
         self.redis_client.sadd(Amazon.CATE_LINK_SET, link)
         if name:
             self.redis_client.sadd(Amazon.KEYWORD_SEARCH_SET, name)
         print(f"{name}:{link}")
         yield res.follow(link)
Ejemplo n.º 2
0
 async def parse(self, response: Response):
     """Collect the department links on the page and follow each of them.

     Async generator. Every ``a`` element whose class attribute is
     ``a-link-normal fsdLink fsdDeptLink`` is visited in reverse order of
     the ``find_all`` result. For each one the absolute link is added to
     ``Amazon.CATE_LINK_SET``, the stripped label is added to
     ``Amazon.KEYWORD_SEARCH_SET``, a summary dict is printed, and a
     follow-up request handled by ``parse_second_cate`` is yielded.

     Args:
         response: Response of the page to scan. Only ``html_tree()`` and
             ``follow()`` are used.

     Yields:
         The objects returned by ``response.follow(...)``.
     """
     soup = response.html_tree()
     dept_anchors = soup.find_all(
         "a", {"class": "a-link-normal fsdLink fsdDeptLink"})
     reversed_anchors = dept_anchors[::-1]

     for anchor in reversed_anchors:
         label = anchor.get_text().strip()
         target = self.domain + anchor.get("href").strip()

         store = self.redis_client
         store.sadd(Amazon.CATE_LINK_SET, target)
         store.sadd(Amazon.KEYWORD_SEARCH_SET, label)

         print({"cate_name": label, "link": target})
         yield response.follow(target, callback=self.parse_second_cate)
Ejemplo n.º 3
0
 def parse(self, response: Response):
     """Parse a goods-list page, store each product tile, then paginate.

     This is a generator: every request that should be scheduled is
     yielded.

     Behaviour, in order:

     1. If the page text contains ``captchacharacters``, the same URL is
        requested again and nothing else is parsed.
     2. For every ``div`` whose class attribute is
        ``a-section a-spacing-medium`` an item dict is built with the keys
        ``asin_id``, ``img_url``, ``goods_name``, ``star_rating``,
        ``price``, ``goods_detail_link``, ``goods_comment_num`` and
        ``goods_comment_link``. A field whose element is missing is
        stored as ``""``. The detail link, when present, is added to
        ``Amazon.GOODS_DEATAIL_LINK_SET``. The item is printed and written
        as JSON into the ``Amazon.GOODS_LIST_ITEM_HASH`` hash under its
        ``asin_id``.
     3. If an ``li`` with class ``a-last`` holds a link, that link is
        followed as the next page.

     Args:
         response: Response of the list page. Only ``text``, ``url``,
             ``follow()`` and ``html_tree()`` are used.

     Yields:
         The objects returned by ``response.follow(...)``.
     """
     if "captchacharacters" in response.text:
         print(f"出现验证码:{response.url}")
         # A ``return <value>`` inside a generator only sets
         # StopIteration.value and is never seen by whoever iterates the
         # callback, so the retry request has to be yielded.
         yield response.follow(response.url)
         return

     bs = response.html_tree()
     goods_tags = bs.find_all("div",
                              {"class": "a-section a-spacing-medium"})
     if not goods_tags:
         return

     for tag in goods_tags:
         item = {}

         # The ASIN sits on an ancestor four levels up; a tile without
         # that ancestor yields ``None`` instead of aborting the page.
         try:
             item["asin_id"] = tag.parent.parent.parent.parent.get(
                 "data-asin")
         except AttributeError:
             item["asin_id"] = None

         img = tag.find("img")
         item["img_url"] = img.get("src") if img is not None else ""
         item["goods_name"] = img.get("alt") if img is not None else ""

         # In the lookups below AttributeError means the element was not
         # found (``None.<attr>``); TypeError means it had no ``href``
         # (``str + None``).
         try:
             item["star_rating"] = tag.find("span", {
                 "class": "a-icon-alt"
             }).string
         except AttributeError:
             item["star_rating"] = ""

         try:
             item["price"] = tag.find("span", {
                 "class": "a-offscreen"
             }).string
         except AttributeError:
             item["price"] = ""

         try:
             item["goods_detail_link"] = self.domain + tag.find(
                 "a", {
                     "class": "a-link-normal s-no-outline"
                 }).get("href")
         except (AttributeError, TypeError):
             item["goods_detail_link"] = ""
         else:
             # Kept outside the ``try`` so that a storage error is not
             # silently recorded as "no detail link".
             self.redis_client.sadd(Amazon.GOODS_DEATAIL_LINK_SET,
                                    item["goods_detail_link"])

         # The comment count and the comment link come from the same span.
         comment_span = tag.find("span", {"class": "a-size-base"})
         try:
             item["goods_comment_num"] = comment_span.string
         except AttributeError:
             item["goods_comment_num"] = ""
         try:
             item["goods_comment_link"] = (
                 self.domain + comment_span.parent.get("href"))
         except (AttributeError, TypeError):
             item["goods_comment_link"] = ""

         print(item)
         if item["asin_id"] is None:
             # Without an ASIN there is no hash field to store the item
             # under; skip the write but keep processing the page.
             continue
         self.redis_client.hset(
             Amazon.GOODS_LIST_ITEM_HASH,
             item["asin_id"],
             json.dumps(item, ensure_ascii=True),
         )

     try:
         next_page_url = self.domain + bs.find("li", {
             "class": "a-last"
         }).a.get("href")
     except (AttributeError, TypeError):
         # No "next" control, or it carries no link: this is the last page.
         return
     if next_page_url:
         yield response.follow(next_page_url)