Example #1
0
    def parse_reviews(self, response):
        """Turn a JSON review response into the final ``Product`` item.

        The response body is decoded as JSON and the review entries are read
        from ``["result"]["data"]``.  Each entry contributes its ``star``,
        ``content`` and ``update_time`` fields; the timestamp is re-formatted
        from ``"%H:%M, %d thg %m, %Y"`` to ``DEFAULT_TIME_FORMAT``.  All other
        product fields are copied unchanged from ``response.meta``.

        Yields:
            Product: one item carrying the collected reviews.
        """
        payload = json.loads(response.text)

        collected = [
            {
                "rating": entry["star"],
                "comment": entry["content"],
                "review_time": utils.transform_time_fmt(
                    entry["update_time"],
                    src_fmt="%H:%M, %d thg %m, %Y",
                    dst_fmt=DEFAULT_TIME_FORMAT,
                ),
            }
            for entry in payload["result"]["data"]
        ]

        # Progress bookkeeping happens once per product, not once per review.
        self.item_scraped_count += 1
        self.print_num_scraped_items(every=20)

        meta = response.meta
        fields = {"domain": self.allowed_domains[0]}
        for key in ("product_id", "url", "brand", "category", "model",
                    "info", "tags", "price", "seller"):
            fields[key] = meta[key]
        fields["reviews"] = collected
        fields["ratings"] = meta["ratings"]
        fields["others"] = meta["others"]

        yield Product(**fields)
    def parse_item(self, response):
        """Scrape a product detail page and yield it as a ``Product``.

        ``category``, ``brand`` and ``model`` are taken from
        ``response.meta``; the price and the descriptive text are read from
        the page itself.  ``product_id`` and ``seller`` are emitted as empty
        strings, ``reviews`` as an empty list and ``ratings`` as an empty
        dict.

        Yields:
            Product: one item describing the page.
        """
        url = response.url
        meta = response.meta
        category = meta["category"]
        brand = meta["brand"]
        model = meta["model"]

        # ``extract_first()`` returns None when the node is absent; default to
        # an empty string so ``.strip()`` is always safe.
        price = response.css(
            "ul.pdt-ul-price div[itemprop=price]::attr(content)"
        ).extract_first(default="").strip()

        # Raw strings keep ``\s`` a regex escape rather than an (invalid)
        # string-literal escape.
        intro = ". ".join(response.css("div.pdtl-des ::text").extract())
        intro = re.sub(r"\s+", " ", intro)

        info = response.css("div.pd-info-left ::text").extract()
        info = ". ".join(elm.strip() for elm in info)
        info = re.sub(r"\s+", " ", info)

        info = intro + ". " + info

        self.item_scraped_count += 1
        self.print_num_scraped_items(every=20)

        yield Product(domain=self.allowed_domains[0],
                      product_id="",
                      url=url,
                      brand=brand,
                      category=category,
                      model=model,
                      info=info,
                      price=price,
                      seller="",
                      reviews=[],
                      ratings={})
Example #3
0
    def parse_reviews(self, response):
        """Build the final ``Product`` from a JSON review response.

        The JSON body supplies per-star counts under ``"stars"`` and the
        review entries under ``"data"``.  Every star value from 1 to 5 gets a
        count, falling back to 0 when the count is missing or not a number.
        The remaining product fields are copied from ``response.meta``.

        Yields:
            Product: one item carrying the reviews and the rating histogram.

        Raises:
            KeyError: if the JSON body has no ``"data"`` key.
        """
        review_data = json.loads(response.text)

        rating_data = review_data.get("stars", {})
        ratings = {}
        for star in range(1, 6):
            try:
                ratings[star] = int(rating_data[str(star)]["count"])
            except (KeyError, IndexError, TypeError, ValueError):
                # Missing bucket, null/odd container, or non-numeric count.
                ratings[star] = 0

        reviews = []
        for review in review_data["data"]:
            # ``.get`` returns None for an explicit JSON null, which would
            # break the string concatenation below; coerce to "".
            title = review.get("title") or ""
            content = review.get("content") or ""
            comment = title + ". " + content
            rating = review.get("rating", "")
            review_time = utils.convert_unix_time(review.get("created_at"))

            try:
                bought_time = utils.convert_unix_time(
                    review["created_by"]["purchased_at"])
            except Exception:
                # Best effort: the purchase date is optional and the failure
                # modes of ``convert_unix_time`` are not visible here, so the
                # catch stays broad.  Unlike a bare ``except`` it lets
                # KeyboardInterrupt / SystemExit propagate.
                bought_time = ""

            reviews.append(
                dict(rating=rating,
                     comment=comment,
                     review_time=review_time,
                     bought_time=bought_time))

        self.item_scraped_count += 1
        self.print_num_scraped_items(every=20)

        meta = response.meta
        yield Product(
            domain=self.allowed_domains[0],
            product_id=meta["product_id"],
            url=meta["url"],
            brand=meta["brand"],
            category=meta["category"],
            model=meta["model"],
            info=meta["info"],
            price=meta["price"],
            seller=meta["seller"],
            reviews=reviews,
            ratings=ratings,
        )
Example #4
0
    def parse_reviews(self, response):
        """Parse an HTML review listing and yield the final ``Product``.

        Each ``.product-comment__item`` element supplies a review time, a
        rating (read from the JSON in the ``data-rating`` attribute) and the
        first text node of the comment.  Elements that cannot be parsed are
        skipped, and only reviews with a rating of at least 1 are kept and
        counted.  The remaining product fields are copied from
        ``response.meta``.

        Yields:
            Product: one item carrying the reviews and the rating histogram.
        """
        meta = response.meta
        divs = response.css(".product-comment__item")

        # Only ratings >= 1 are counted below, so the buckets are seeded for
        # 1..5 rather than 0..4 (which left a dead 0 bucket and no 5 bucket
        # unless a 5-star review happened to appear).
        ratings = {star: 0 for star in range(1, 6)}
        reviews = []
        for div in divs:
            try:
                review_time = div.css(".comment-item__time .date::text").extract_first()
                review_time = utils.transform_time_fmt(review_time, src_fmt="%d/%m/%Y %H:%M")

                json_str = div.css(".rating-stars::attr(data-rating)").extract_first()
                rating = int(float(json.loads(json_str)["rating"]))

                comment = div.css(".comment-item__content ::text").extract_first().strip()
            except Exception:
                # Best effort: skip any review whose markup is incomplete or
                # malformed.  ``Exception`` (not a bare ``except``) so that
                # KeyboardInterrupt / SystemExit still propagate.
                continue

            if rating >= 1:
                ratings[rating] = ratings.get(rating, 0) + 1
                reviews.append(dict(review_time=review_time, rating=rating, comment=comment))

        self.item_scraped_count += 1
        self.print_num_scraped_items(every=20)

        yield Product(
            domain=self.allowed_domains[0],
            product_id=meta["product_id"],
            url=meta["url"],
            brand=meta["brand"],
            category=meta["category"],
            model=meta["model"],
            info=meta["info"],
            price=meta["price"],
            seller=meta["seller"],
            tags=meta["tags"],
            reviews=reviews,
            ratings=ratings
        )
Example #5
0
    def parse_review(self, response):
        """Combine a JSON review payload with the pending item into a ``Product``.

        The partially scraped product is read from ``response.meta["item"]``.
        From the JSON body, ``model.ratings.scores`` is mapped onto the keys
        5, 4, 3, ... in list order, and every entry of ``model.items`` becomes
        one review dict.  A null ``scores`` or ``items`` value is treated as
        empty.

        Yields:
            Product: one item carrying the reviews and the rating histogram.
        """
        item = response.meta["item"]

        model_data = json.loads(response.text)["model"]

        # First score goes to key 5, the next to 4, and so on.
        ratings = {}
        for offset, score in enumerate(model_data["ratings"]["scores"] or []):
            ratings[5 - offset] = score

        reviews = []
        for entry in model_data["items"] or []:
            stars = entry["rating"]
            reviewed_at = utils.convert_unix_time(entry["zonedReviewTime"])
            bought_at = utils.convert_unix_time(entry["zonedBoughtDate"])

            # Missing and null title/content both collapse to "".
            title = entry.get("reviewTitle", "") or ""
            body = entry.get("reviewContent", "") or ""

            reviews.append({
                "rating": stars,
                "review_time": reviewed_at,
                "comment": " ".join((title, body)),
                "bought_time": bought_at,
            })

        self.item_scraped_count += 1
        self.print_num_scraped_items(every=5)

        copied_keys = ("product_id", "url", "brand", "category",
                       "model", "info", "price", "seller")
        yield Product(
            domain=self.allowed_domains[0],
            **{key: item[key] for key in copied_keys},
            reviews=reviews,
            ratings=ratings
        )
Example #6
0
    def parse_item(self, response):
        """Scrape a product detail page, then follow its review listing.

        The category comes from ``response.meta``; the id is captured from
        the page URL, and brand, seller, model, price, description text and
        tags are read from the page.  Fields whose source is missing fall
        back to an empty string (model and price stay ``None``).

        Yields:
            Request: to the review listing, with the scraped fields passed as
                ``meta``, when the page exposes a ``data-allreviews`` URL.
            Product: with empty reviews and ratings otherwise.
        """
        url = response.url
        meta = response.meta
        category = meta["category"]

        # URLs that do not match the pattern simply get an empty id.
        id_match = re.match(r".*-(.*).offer=.*", url)
        product_id = id_match.group(1) if id_match else ""

        # The first link is the brand, the optional second one the seller.
        brand_seller_text = response.css(".product-detail__title-brand>a::text").extract()
        brand = brand_seller_text[0] if brand_seller_text else ""
        if len(brand_seller_text) > 1:
            seller = brand_seller_text[1].strip().replace(":\xa0", "")
        else:
            seller = ""
        model = response.css(".product-detail__title>h1::text").extract_first()
        price = response.css(".product-detail__price-info ::text").extract_first()

        intro = ". ".join(response.css(".short-des__content ::text").extract())
        intro = re.sub(r"\s+", " ", intro)

        # Only the first specs table is used; pages without one get "".
        spec_tables = response.css(".product-specs__table")
        specs = " ".join(spec_tables[0].css("::text").extract()) if spec_tables else ""

        # ``extract()`` returns an empty list when nothing matches, so no
        # exception handling is needed here.
        info = response.css(".product-detail__description ::text").extract()
        info = " ".join(elm.replace("\xa0", " ") for elm in info)
        info = re.sub(r"\s+", " ", info)

        info = intro + ". " + specs + ". " + info
        tags = ",".join(response.css(".product-tag__list>a::text").extract())

        item = dict(product_id=product_id, model=model, category=category, url=url,
                    price=price, brand=brand, seller=seller, tags=tags, info=info)

        # Crawl ratings and reviews
        review_url = response.css(".product-comment__list::attr(data-allreviews)").extract_first()
        if review_url is not None:
            review_url = self.base_url + review_url
            yield Request(review_url, self.parse_reviews, meta=item, errback=self.errback)
        else:
            # No review listing: the product is complete as it stands.
            self.item_scraped_count += 1
            self.print_num_scraped_items(every=20)

            yield Product(
                domain=self.allowed_domains[0],
                product_id=item["product_id"],
                url=item["url"],
                brand=item["brand"],
                category=item["category"],
                model=item["model"],
                info=item["info"],
                price=item["price"],
                seller=item["seller"],
                tags=item["tags"],
                reviews=[],
                ratings={}
            )
Example #7
0
    def parse_item(self, response):
        """Scrape a product page, its seller name and all of its reviews.

        Product fields are read from the page in ``response``.  The seller
        name is looked up on a separate page fetched through
        ``self.pm.get_response``.  Rating counts are derived from the
        percentage shown for each star level and the total vote count.
        Reviews on the product page itself are parsed first; further pages
        are fetched through the ``GetProductComment`` endpoint.

        Yields:
            Product: one item describing the product, with its reviews and
                rating histogram.
        """
        url = response.url
        category = response.meta["category"]
        intro_div = response.css("#tr-intro-productdt")
        product_id = response.css("#productNo::attr(value)").extract_first()
        model = intro_div.css(".tr-prd-name2::text").extract_first().strip()
        brand = intro_div.css(
            ".tr-thuonghieu-reg>a::text").extract_first().strip()
        seller_url = intro_div.css(
            ".tr-gn-supplier a::attr(href)").extract_first()

        # Crawl seller name.  The seller page is kept in its own variable:
        # rebinding ``response`` here would make every later
        # ``response.css`` / ``response.text`` call operate on the seller
        # page (or on None when the fetch fails) instead of the product page.
        seller = ""
        seller_response = self.pm.get_response(seller_url)
        if seller_response is not None:
            root = html.document_fromstring(seller_response.content)
            name_elms = root.cssselect(".tr-pr-name1")
            if name_elms:
                seller = name_elms[0].text

        price = intro_div.css(".th-detail-price::text").extract_first().strip()
        info = response.css(
            "#tr-detail-productdt .tr-prd-info-content ::text").extract()
        info = " ".join(elm.strip() for elm in info)

        # Calculate rating count
        num_reviews = response.css(
            "#tr-productdt-rank .vote-count::text").extract_first()
        num_reviews = 0 if num_reviews is None else int(num_reviews)

        # Each entry has its last character dropped (presumably a trailing
        # "%" - confirm against the page) and is turned into an absolute
        # count; the first entry is mapped to 5 stars, the next to 4, etc.
        percents = response.css(
            "#tr-productdt-rank "
            ".tr-rank-percent>div:nth-child(3)::text").extract()
        ratings = {
            5 - i: int(round(num_reviews * float(text[:-1]) / 100))
            for i, text in enumerate(percents)
        }

        # Crawl all reviews of product.
        # NOTE(review): the divisor assumes 5 reviews per page - confirm.
        num_page_reviews = int(math.ceil(num_reviews / 5))

        reviews = self.crawl_review(url=None, raw_html=response.text)
        for page in range(2, num_page_reviews + 1):
            # A dedicated name keeps ``url`` (the product URL that is
            # yielded below) from being overwritten by a review-page URL.
            review_url = "https://www.yes24.vn/Product/" \
                         "GetProductComment?productNo={}&page={}".format(product_id, page)
            reviews.extend(self.crawl_review(review_url))

        self.item_scraped_count += 1
        self.print_num_scraped_items(every=20)

        yield Product(domain=self.allowed_domains[0],
                      product_id=product_id,
                      url=url,
                      brand=brand,
                      category=category,
                      model=model,
                      info=info,
                      price=price,
                      seller=seller,
                      reviews=reviews,
                      ratings=ratings)