Esempio n. 1
0
    def parse_category(self, response):
        """Parse a category listing page and schedule item and next-page requests.

        Every ``li.pl18-item-li`` element is read for its brand, model name and
        relative link. A request to ``self.parse_item`` is yielded for each
        item whose absolute URL passes ``utils.is_valid_url``; the request meta
        carries ``brand``, ``model``, ``url`` and ``category``. While
        ``meta["page_idx"]`` is below ``self.page_per_category_limit`` and the
        page produced at least one item, the next listing page is requested.

        Args:
            response: Listing page response. Its ``meta`` must contain
                ``category``, ``page_idx`` and ``category_url_fmt``.

        Yields:
            Request: Item page requests, then at most one next-page request.
        """
        meta = dict(response.meta)

        # Navigate to item
        items = []
        for li in response.css("li.pl18-item-li"):
            # A tile without a link cannot be crawled; skip it instead of
            # failing the whole page on ``base_url + None``.
            href = li.css(".pl18-item-name a::attr(href)").extract_first()
            if not href:
                continue

            # ``default=""`` keeps a missing brand node from raising on strip().
            brand = li.css(".pl18-item-brand ::text").extract_first(
                default="").strip()
            names = [
                name.strip()
                for name in li.css(".pl18-item-name ::text").extract()
            ]
            model = " ".join(names).strip()

            items.append(dict(brand=brand, model=model,
                              url=self.base_url + href))

        self.logger.info("Parse url {}, Num item urls : {}".format(
            response.url, len(items)))
        for item in items:
            item_url = item["url"]
            if utils.is_valid_url(item_url):
                item["category"] = meta["category"]
                yield Request(item_url,
                              self.parse_item,
                              meta=item,
                              errback=self.errback)

        # Navigate to next page
        if meta["page_idx"] < self.page_per_category_limit and items:
            meta["page_idx"] += 1
            next_page = meta["category_url_fmt"].format(meta["page_idx"])
            yield Request(next_page,
                          self.parse_category,
                          meta=meta,
                          errback=self.errback)
Esempio n. 2
0
    def parse_category(self, response):
        """Schedule item requests for a listing page, then the next page.

        Item links are taken from ``.product-box-list>div>a`` and requested
        with ``self.parse_item`` when ``utils.is_valid_url`` accepts them. The
        incoming meta is forwarded unchanged to every item request. The next
        listing page is requested only while ``meta["page_idx"]`` is below
        ``self.page_per_category_limit`` and the current page had links.

        Args:
            response: Listing page response whose ``meta`` holds ``page_idx``
                and ``category_url_fmt``.

        Yields:
            Request: Item page requests, then at most one next-page request.
        """
        meta = dict(response.meta)
        hrefs = response.css(".product-box-list>div>a::attr(href)").extract()

        log_line = "Parse url {}, Num item urls : {}".format(
            response.url, len(hrefs))
        self.logger.info(log_line)

        for href in hrefs:
            if not utils.is_valid_url(href):
                continue
            yield Request(href,
                          callback=self.parse_item,
                          meta=meta,
                          errback=self.errback,
                          headers=self.headers)

        # Stop paginating once the limit is reached or the page was empty.
        if meta["page_idx"] >= self.page_per_category_limit or not hrefs:
            return

        meta["page_idx"] += 1
        following_url = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(following_url,
                      callback=self.parse_category,
                      meta=meta,
                      errback=self.errback,
                      headers=self.headers)
Esempio n. 3
0
    def crawl_reviews(self, url):
        """Download the review JSON at ``url`` and return ratings and reviews.

        Example url:
        ``https://my.lazada.vn/pdp/review/getReviewList?itemId=102463766``
        ``&pageSize=15&filter=0&sort=0&pageNo=1``

        Args:
            url: Review endpoint URL. If ``utils.is_valid_url`` rejects it, no
                request is made.

        Returns:
            tuple: ``(ratings, reviews)``. ``ratings`` maps ``5 - index`` to
            each entry of ``model.ratings.scores`` (presumably star level to
            count; confirm against the API). ``reviews`` is a list of dicts
            with keys ``rating``, ``review_time``, ``comment`` and
            ``bought_time``. Both are empty for an invalid URL.
        """
        if not utils.is_valid_url(url):
            return {}, []

        raw_body = self.get_response(url).content.decode("utf-8")
        payload = json.loads(raw_body)

        # A null "scores" / "items" field is treated as an empty list.
        score_list = payload["model"]["ratings"]["scores"] or []
        ratings = {}
        for offset, score in enumerate(score_list):
            ratings[5 - offset] = score

        reviews = []
        for entry in payload["model"]["items"] or []:
            stars = entry["rating"]
            reviewed_at = utils.convert_unix_time(entry["zonedReviewTime"])
            bought_at = utils.convert_unix_time(entry["zonedBoughtDate"])

            # Title and content may be missing or null; join them with a space.
            title = entry.get("reviewTitle", "") or ""
            body = entry.get("reviewContent", "") or ""

            reviews.append({
                "rating": stars,
                "review_time": reviewed_at,
                "comment": title + " " + body,
                "bought_time": bought_at,
            })

        return ratings, reviews
Esempio n. 4
0
    def parse_category(self, response):
        """Parse a listing page whose items are embedded as ``window.pageData``.

        The first ``<script>`` tag starting with ``<script>window.pageData=``
        is decoded as JSON and ``data["mods"]["listItems"]`` is read from it.
        For each item a request to ``self.parse_item`` is yielded with the
        collected fields under ``meta["item"]``. While ``meta["page_idx"]`` is
        below ``self.page_per_category_limit`` and items were found, the next
        listing page is requested.

        A page without the expected script, or without ``mods.listItems``, is
        treated as an empty page instead of raising.

        Args:
            response: Listing page response whose ``meta`` holds ``category``,
                ``page_idx`` and ``category_url_fmt``.

        Yields:
            Request: Item page requests, then at most one next-page request.
        """
        meta = dict(response.meta)

        # Find item data in script tag
        prefix = "<script>window.pageData="
        postfix = "</script>"
        data = {}
        for script in response.css("script").extract():
            if script.startswith(prefix):
                # Strip the surrounding tag only from the script that matched;
                # slicing a fallback value would leave nothing to decode.
                data = json.loads(script[len(prefix):-len(postfix)])
                break
        else:
            self.logger.warning(
                "No window.pageData script found in {}".format(response.url))

        items = (data.get("mods") or {}).get("listItems") or []

        self.logger.info("Parse url {}, Num item urls : {}".format(response.url, len(items)))
        for item in items:
            # productUrl has no scheme in the payload, so one is prepended.
            item_url = "https:" + item["productUrl"]
            item_data = dict(product_id=item["itemId"], model=item["name"], price=item["priceShow"],
                             description=item["description"], num_reviews=item["review"],
                             brand=item["brandName"], seller=item["sellerName"],
                             category=meta["category"])
            if utils.is_valid_url(item_url):
                yield Request(item_url, self.parse_item,
                              meta=dict(item=item_data),
                              errback=self.errback)
            else:
                print("\n\nERROR         XXXXXXXx     xXXX\nItem url : {}\n\n\n".format(item_url))
        # Navigate to next page
        print("\n\n\n======== NEXT NEXT NEXT NEXT NEXT =========\n\n")
        print("------  Page:  {} ---- Len(items): {}   --------\n".format(meta["page_idx"]+1, len(items)))
        if meta["page_idx"] < self.page_per_category_limit and len(items) > 0:
            meta["page_idx"] += 1
            print("\n\n\n======== NEXT NEXT NEXT NEXT NEXT =========\n\n")
            print("------    {}    --------".format(meta["page_idx"]))
            print("\n\n\n======== NEXT NEXT NEXT NEXT NEXT =========\n\n")
            next_page = meta["category_url_fmt"].format(meta["page_idx"])
            yield Request(next_page, self.parse_category, meta=meta, errback=self.errback)
Esempio n. 5
0
    def parse_category(self, response):
        """Schedule item requests for a listing page, then the next page.

        Relative item links are read from
        ``div.product-item a.product-item__thumbnail`` and prefixed with
        ``self.base_url``. When the page has no links, its HTML is saved via
        ``utils.save_str`` for inspection. The next page is requested only
        while ``meta["page_idx"] + 1`` is below
        ``self.page_per_category_limit`` and the current page had links.

        Args:
            response: Listing page response whose ``meta`` holds ``page_idx``
                and ``category_url_fmt``.

        Yields:
            Request: Item page requests, then at most one next-page request.
        """
        meta = dict(response.meta)

        # Navigate to item
        link_selector = "div.product-item a.product-item__thumbnail::attr(href)"
        hrefs = response.css(link_selector).extract()

        if not hrefs:
            # Keep a copy of the empty page so it can be examined later.
            utils.save_str(response.text, "./Temp/adayroi_empty_category.html")

        self.logger.info(
            "Parse url {}, Num item urls : {}".format(response.url, len(hrefs)))

        for href in hrefs:
            absolute_url = self.base_url + href
            if not utils.is_valid_url(absolute_url):
                continue
            yield Request(absolute_url, callback=self.parse_item,
                          meta=meta, errback=self.errback)

        # Navigate to next page
        has_next = (meta["page_idx"] + 1) < self.page_per_category_limit and len(hrefs) > 0
        if has_next:
            meta["page_idx"] += 1
            following_url = meta["category_url_fmt"].format(meta["page_idx"])
            yield Request(following_url, callback=self.parse_category,
                          meta=meta, errback=self.errback)
Esempio n. 6
0
    def parse_category_from_id(self, category_id, num_items, max_items=7):
        """Fetch a category's product list by id and schedule item requests.

        The product listing endpoint is queried for page 1 with a page size of
        ``num_items``. For the first ``max_items`` products a request to
        ``self.parse_item`` is yielded, with the product fields as request
        meta.

        Args:
            category_id: Category identifier placed in the listing URL and
                used to look up the category name in ``self.map_id_category``.
            num_items: Page size requested from the listing endpoint.
            max_items: Maximum number of returned products to follow. Defaults
                to 7, the previously hard-coded limit; ``None`` follows all.

        Yields:
            Request: One request per followed product whose detail URL passes
            ``utils.is_valid_url``.
        """
        # Get all item
        print("\nCategory id : {}, Number items : {}".format(
            category_id, num_items))
        item_urls_fmt = "https://www.sendo.vn/m/wap_v2/category/product?" \
                        "category_id={}&p=1&s={}&sortType=default_listing_desc"
        all_item_url = item_urls_fmt.format(category_id, num_items)
        try:
            json_data = json.loads(
                self.get_response(all_item_url).content.decode("utf-8"))
            # A null "data" field is treated as an empty product list.
            full_items = json_data["result"]["data"] or []
        except Exception:
            # Best effort: log with traceback and give up on this category.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            self.logger.exception(
                "\nError when all items of cat_id : {}, total_items : {}".
                format(category_id, num_items))
            return 0

        category = self.map_id_category.get(category_id,
                                            {}).get("Category name", "")
        for full_item in full_items[:max_items]:
            cat_path = full_item["cat_path"]
            item_url_key = cat_path.replace(".html/", "")
            # item_url is the endpoint that is requested; ``url`` below is only
            # stored in the request meta.
            item_url = "https://www.sendo.vn/m/wap_v2/full/san-pham/{}".format(
                item_url_key)

            url = self.base_url + "/" + cat_path
            item = dict(category=category,
                        category_id=category_id,
                        url=url,
                        product_id=full_item["product_id"],
                        model=full_item["name"],
                        price=full_item["final_price"],
                        seller=full_item["shop_name"])
            if utils.is_valid_url(item_url):
                yield Request(item_url,
                              self.parse_item,
                              meta=item,
                              errback=self.errback)
Esempio n. 7
0
    def parse_item(self, response):
        """Locate the product's ``pageUrl`` in a script tag and request it.

        The response URL is stored on ``response.meta["item"]`` under ``url``.
        The first script containing the text ``pageUrl`` is then scanned: the
        value is read from 3 characters after the keyword up to the next
        double quote and prefixed with ``https:``. If the result passes
        ``utils.is_valid_url``, a request to ``self.parse_info`` is yielded
        with the same meta.

        Args:
            response: Item page response whose ``meta`` holds an ``item`` dict.

        Yields:
            Request: At most one request for the extracted page URL.
        """
        item = response.meta["item"]
        url = response.url
        item.update({"url": url})

        # Extract full info of product
        keyword = "pageUrl"
        page_url = ""
        try:
            for script in response.css("script").extract():
                start_index = script.find(keyword)
                if start_index < 0:
                    continue
                # Skip the keyword plus 3 separator characters; presumably the
                # `":"` between a JSON key and its value - confirm on a page.
                start_index += len(keyword) + 3
                end_index = script.find('"', start_index)
                # Without a closing quote the slice would be truncated garbage,
                # so leave page_url empty in that case.
                if end_index >= 0:
                    page_url = "https:" + script[start_index:end_index]
                break
            should_follow = utils.is_valid_url(page_url)
        except Exception:
            print("Error when extract info of item ", url)
            return

        # The yield stays outside the try block so that closing the generator
        # or an exception thrown into it is not reported as an extraction error.
        if should_follow:
            yield Request(page_url, self.parse_info, meta=response.meta, errback=self.errback)
Esempio n. 8
0
    def parse_category(self, response):
        """Parse a JSON product listing and schedule item and next-page requests.

        The response body is decoded as JSON and the products are read from
        ``result.data``. For each product a request to ``self.parse_item`` is
        yielded with the product fields as request meta. While
        ``meta["page_idx"]`` is below ``self.page_per_category_limit`` and the
        page returned products, the next listing page is requested.

        Args:
            response: Listing response whose ``meta`` holds ``category``,
                ``category_id``, ``page_idx`` and ``category_url_fmt``.

        Yields:
            Request: Item requests, then at most one next-page request.
        """
        meta = dict(response.meta)

        # Get all item
        try:
            json_data = json.loads(response.text)
            # A null "data" field is treated as an empty product list.
            full_items = json_data["result"]["data"] or []
        except Exception:
            # Best effort: log with traceback and give up on this page.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            self.logger.exception(
                "\nError when all items of category {}, cat_id : {}".format(
                    meta["category"], meta["category_id"]))
            return 0

        self.logger.info("Parse url {}, Num item urls : {}".format(
            response.url, len(full_items)))
        for full_item in full_items:
            cat_path = full_item["cat_path"]
            item_url_key = cat_path.replace(".html/", "")
            # item_url is the endpoint that is requested; ``url`` below is only
            # stored in the request meta.
            item_url = "https://www.sendo.vn/m/wap_v2/full/san-pham/{}".format(
                item_url_key)

            url = self.base_url + "/" + cat_path
            item = dict(category=meta["category"],
                        category_id=meta["category_id"],
                        url=url,
                        product_id=full_item["product_id"],
                        model=full_item["name"],
                        price=full_item["final_price"],
                        seller=full_item["shop_name"])

            if utils.is_valid_url(item_url):
                yield Request(item_url,
                              self.parse_item,
                              meta=item,
                              errback=self.errback)

        # Navigate to next page
        if meta["page_idx"] < self.page_per_category_limit and full_items:
            meta["page_idx"] += 1
            next_page = meta["category_url_fmt"].format(
                meta["category_id"], meta["page_idx"])
            yield Request(next_page,
                          self.parse_category,
                          meta=meta,
                          errback=self.errback)