def parse_reviews(self, response):
    """Parse a review-API JSON payload and yield the finished Product.

    Each entry contributes a rating, comment, and a timestamp that is
    normalized from the site's Vietnamese time format into
    DEFAULT_TIME_FORMAT; everything else comes from the request meta.
    """
    payload = json.loads(response.text)
    collected = []
    for entry in payload["result"]["data"]:
        when = utils.transform_time_fmt(
            entry["update_time"],
            src_fmt="%H:%M, %d thg %m, %Y",
            dst_fmt=DEFAULT_TIME_FORMAT)
        collected.append(dict(rating=entry["star"],
                              comment=entry["content"],
                              review_time=when))
    self.item_scraped_count += 1
    self.print_num_scraped_items(every=20)
    meta = response.meta
    yield Product(domain=self.allowed_domains[0],
                  product_id=meta["product_id"],
                  url=meta["url"],
                  brand=meta["brand"],
                  category=meta["category"],
                  model=meta["model"],
                  info=meta["info"],
                  tags=meta["tags"],
                  price=meta["price"],
                  seller=meta["seller"],
                  reviews=collected,
                  ratings=meta["ratings"],
                  others=meta["others"])
def parse_item(self, response):
    """Parse a product detail page and yield a review-less Product.

    Fixes: regex patterns are now raw strings (``"\\s+"`` without the
    ``r`` prefix raises an invalid-escape SyntaxWarning on modern
    Python), and a missing price node no longer crashes with
    AttributeError -- it falls back to an empty string.
    """
    url = response.url
    meta = response.meta
    # Price lives in the structured-data "content" attribute; default to
    # "" so .strip() is safe when the node is absent.
    price = response.css(
        "ul.pdt-ul-price div[itemprop=price]::attr(content)"
    ).extract_first(default="").strip()
    # Short description, joined and whitespace-normalized.
    intro = ". ".join(response.css("div.pdtl-des ::text").extract())
    intro = re.sub(r"\s+", " ", intro)
    # Spec/info section, stripped per fragment then normalized.
    info = ". ".join(
        elm.strip()
        for elm in response.css("div.pd-info-left ::text").extract())
    info = re.sub(r"\s+", " ", info)
    info = intro + ". " + info
    self.item_scraped_count += 1
    self.print_num_scraped_items(every=20)
    yield Product(domain=self.allowed_domains[0],
                  product_id="",
                  url=url,
                  brand=meta["brand"],
                  category=meta["category"],
                  model=meta["model"],
                  info=info,
                  price=price,
                  seller="",
                  reviews=[],
                  ratings={})
def parse_reviews(self, response):
    """Parse a review-API JSON payload (Tiki-style) and yield a Product.

    Fixes: both bare ``except:`` clauses are narrowed to the exceptions
    the dict lookups / conversions can actually raise, so genuine bugs
    (NameError, KeyboardInterrupt, ...) are no longer silently swallowed.
    """
    review_data = json.loads(response.text)
    rating_data = review_data.get("stars", {})
    # Star histogram: buckets "1".."5"; missing or malformed counts -> 0.
    ratings = {}
    for star in range(1, 6):
        try:
            count = int(rating_data[str(star)]["count"])
        except (KeyError, TypeError, ValueError):
            count = 0
        ratings[star] = count
    reviews = []
    for review in review_data["data"]:
        comment = review.get("title", "") + ". " + review.get("content", "")
        review_time = utils.convert_unix_time(review.get("created_at"))
        try:
            bought_time = utils.convert_unix_time(
                review["created_by"]["purchased_at"])
        except (KeyError, TypeError):
            # Reviewer record may lack a purchase timestamp.
            bought_time = ""
        reviews.append(dict(rating=review.get("rating", ""),
                            comment=comment,
                            review_time=review_time,
                            bought_time=bought_time))
    self.item_scraped_count += 1
    self.print_num_scraped_items(every=20)
    meta = response.meta
    yield Product(
        domain=self.allowed_domains[0],
        product_id=meta["product_id"],
        url=meta["url"],
        brand=meta["brand"],
        category=meta["category"],
        model=meta["model"],
        info=meta["info"],
        price=meta["price"],
        seller=meta["seller"],
        reviews=reviews,
        ratings=ratings,
    )
def parse_reviews(self, response):
    """Scrape on-page review blocks and yield the finished Product.

    Fixes a real bug: the star-count histogram was initialized as
    ``{r: 0 for r in range(5)}`` (keys 0-4) although ratings run 1-5,
    so the yielded dict carried a bogus ``0`` bucket and omitted ``5``
    unless a 5-star review happened to occur.  Also narrows the bare
    ``except:`` to the failures a malformed review block can cause, and
    drops commented-out dead code.
    """
    meta = response.meta
    ratings = {star: 0 for star in range(1, 6)}
    reviews = []
    for div in response.css(".product-comment__item"):
        try:
            review_time = div.css(
                ".comment-item__time .date::text").extract_first()
            review_time = utils.transform_time_fmt(
                review_time, src_fmt="%d/%m/%Y %H:%M")
            json_str = div.css(
                ".rating-stars::attr(data-rating)").extract_first()
            rating = int(float(json.loads(json_str)["rating"]))
            comment = div.css(
                ".comment-item__content ::text").extract_first().strip()
        except (AttributeError, KeyError, TypeError, ValueError):
            # Skip review blocks missing any required field
            # (extract_first() -> None, absent JSON key, bad number, ...).
            continue
        if rating >= 1:
            ratings[rating] = ratings.get(rating, 0) + 1
        reviews.append(dict(review_time=review_time,
                            rating=rating,
                            comment=comment))
    self.item_scraped_count += 1
    self.print_num_scraped_items(every=20)
    yield Product(
        domain=self.allowed_domains[0],
        product_id=meta["product_id"],
        url=meta["url"],
        brand=meta["brand"],
        category=meta["category"],
        model=meta["model"],
        info=meta["info"],
        price=meta["price"],
        seller=meta["seller"],
        tags=meta["tags"],
        reviews=reviews,
        ratings=ratings
    )
def parse_review(self, response):
    """Parse a review-API JSON payload and yield the completed Product."""
    item = response.meta["item"]
    payload = json.loads(response.text)
    # Score list maps onto stars 5..1 via the 5-idx transform
    # (presumably the API lists highest-star counts first -- as original).
    score_list = payload["model"]["ratings"]["scores"] or []
    ratings = {5 - idx: score for idx, score in enumerate(score_list)}
    reviews = []
    for entry in payload["model"]["items"] or []:
        title = entry.get("reviewTitle", "") or ""
        body = entry.get("reviewContent", "") or ""
        reviews.append(dict(
            rating=entry["rating"],
            review_time=utils.convert_unix_time(entry["zonedReviewTime"]),
            comment=title + " " + body,
            bought_time=utils.convert_unix_time(entry["zonedBoughtDate"])))
    self.item_scraped_count += 1
    self.print_num_scraped_items(every=5)
    yield Product(
        domain=self.allowed_domains[0],
        product_id=item["product_id"],
        url=item["url"],
        brand=item["brand"],
        category=item["category"],
        model=item["model"],
        info=item["info"],
        price=item["price"],
        seller=item["seller"],
        reviews=reviews,
        ratings=ratings
    )
def parse_item(self, response):
    """Parse a product page; follow the all-reviews endpoint when present,
    otherwise yield the Product immediately with empty reviews/ratings.

    Fixes: regex patterns are raw strings (non-raw ``"\\s+"`` raises an
    invalid-escape SyntaxWarning on modern Python) and the three bare
    ``except:`` clauses are replaced with explicit handling.
    """
    url = response.url
    meta = response.meta
    category = meta["category"]
    # NOTE(review): the dots here are unescaped wildcards; presumably
    # "-<id>.offer=" is the intended URL shape -- confirm before
    # tightening the pattern.
    match = re.match(r".*-(.*).offer=.*", url)
    product_id = match.group(1) if match else ""
    # First anchor is the brand, second (when present) the seller.
    brand_seller_text = response.css(
        ".product-detail__title-brand>a::text").extract()
    brand = brand_seller_text[0] if brand_seller_text else ""
    if len(brand_seller_text) > 1:
        seller = brand_seller_text[1].strip().replace(":\xa0", "")
    else:
        seller = ""
    model = response.css(".product-detail__title>h1::text").extract_first()
    price = response.css(
        ".product-detail__price-info ::text").extract_first()
    intro = ". ".join(response.css(".short-des__content ::text").extract())
    intro = re.sub(r"\s+", " ", intro)
    try:
        specs = " ".join(
            response.css(".product-specs__table")[0].css("::text").extract())
    except IndexError:
        # No spec table on this page.
        specs = ""
    info = " ".join(
        elm.replace("\xa0", " ")
        for elm in response.css(
            ".product-detail__description ::text").extract())
    info = re.sub(r"\s+", " ", info)
    info = intro + ". " + specs + ". " + info
    tags = ",".join(response.css(".product-tag__list>a::text").extract())
    item = dict(product_id=product_id, model=model, category=category,
                url=url, price=price, brand=brand, seller=seller,
                tags=tags, info=info)
    # Crawl ratings and reviews via the all-reviews endpoint if advertised.
    review_url = response.css(
        ".product-comment__list::attr(data-allreviews)").extract_first()
    if review_url is not None:
        yield Request(self.base_url + review_url, self.parse_reviews,
                      meta=item, errback=self.errback)
    else:
        self.item_scraped_count += 1
        self.print_num_scraped_items(every=20)
        yield Product(
            domain=self.allowed_domains[0],
            product_id=item["product_id"],
            url=item["url"],
            brand=item["brand"],
            category=item["category"],
            model=item["model"],
            info=item["info"],
            price=item["price"],
            seller=item["seller"],
            tags=item["tags"],
            reviews=[],
            ratings={}
        )
def parse_item(self, response):
    """Parse a yes24 product page, fetch the seller name out-of-band,
    crawl every review page, and yield the completed Product.

    Fixes two variable-clobbering bugs in the original:
    1. ``response`` was overwritten by the seller-page HTTP response, so
       the later ``response.css(...)`` / ``response.text`` calls ran on a
       requests-style object without ``.css`` -- or on ``None`` when the
       fetch failed.
    2. ``url`` was overwritten inside the review-pagination loop, so the
       yielded Product carried the last review-API URL instead of the
       product page URL.
    """
    url = response.url
    category = response.meta["category"]
    intro_div = response.css("#tr-intro-productdt")
    product_id = response.css("#productNo::attr(value)").extract_first()
    model = intro_div.css(".tr-prd-name2::text").extract_first().strip()
    brand = intro_div.css(
        ".tr-thuonghieu-reg>a::text").extract_first().strip()
    seller_url = intro_div.css(
        ".tr-gn-supplier a::attr(href)").extract_first()
    # Crawl seller name from the supplier page (separate HTTP client).
    seller = ""
    seller_resp = self.pm.get_response(seller_url)
    if seller_resp is not None:
        root = html.document_fromstring(seller_resp.content)
        name_elms = root.cssselect(".tr-pr-name1")
        if name_elms:
            seller = name_elms[0].text
    price = intro_div.css(".th-detail-price::text").extract_first().strip()
    info = response.css(
        "#tr-detail-productdt .tr-prd-info-content ::text").extract()
    info = " ".join(elm.strip() for elm in info)
    # Convert the percent-per-star bars into absolute counts.
    num_reviews = response.css(
        "#tr-productdt-rank .vote-count::text").extract_first()
    num_reviews = 0 if num_reviews is None else int(num_reviews)
    percents = response.css(
        "#tr-productdt-rank "
        ".tr-rank-percent>div:nth-child(3)::text").extract()
    percents = [float(p[:-1]) for p in percents]
    # Mapped to stars 5..1 (assumes bars are ordered 5-star first, as in
    # the original -- confirm against the page markup).
    ratings = {
        5 - i: int(round(num_reviews * p / 100))
        for i, p in enumerate(percents)
    }
    # First review page is embedded in the product page; the rest come
    # from the comment API, 5 reviews per page.
    num_page_reviews = int(math.ceil(num_reviews / 5))
    reviews = self.crawl_review(url=None, raw_html=response.text)
    for page in range(2, num_page_reviews + 1):
        page_url = ("https://www.yes24.vn/Product/"
                    "GetProductComment?productNo={}&page={}"
                    .format(product_id, page))
        reviews.extend(self.crawl_review(page_url))
    self.item_scraped_count += 1
    self.print_num_scraped_items(every=20)
    yield Product(domain=self.allowed_domains[0],
                  product_id=product_id,
                  url=url,
                  brand=brand,
                  category=category,
                  model=model,
                  info=info,
                  price=price,
                  seller=seller,
                  reviews=reviews,
                  ratings=ratings)