Example #1
0
    def parseDepartment(self, response):
        """Parse an Amazon-style bestsellers department page.

        Builds one ProductItem per product and yields a Request to
        self.parseProduct for each product page, passing the partially
        filled item through request.meta['item'].
        """
        hxs = HtmlXPathSelector(response)
        products = hxs.select("//div[@class='zg_itemImmersion']")

        for product in products:
            item = ProductItem()

            # The title tag is sometimes missing; extract() returns an
            # empty list in that case, so test before indexing.
            list_name = product.select(
                "div[@class='zg_itemWrapper']//div[@class='zg_title']/a/text()"
            ).extract()
            if list_name:
                item['list_name'] = list_name[0]
            else:
                # if there's no product name don't include this product in the list, move on to the next
                continue

            url = product.select(
                "div[@class='zg_itemWrapper']//div[@class='zg_title']/a/@href"
            ).extract()
            if url:
                item['url'] = url[0].strip()
            else:
                # if there's no product url don't include this product in the list, move on to the next
                # one of the products in Lawn & Garden is missing a name and url, also in Sports & Outdoors
                continue

            #TODO: this needs to be refined, many prices etc. extract all prices? new, used etc
            prices = product.select(
                "div[@class='zg_itemWrapper']//div[@class='zg_price']")

            price = prices.select("strong[@class='price']/text()").extract()
            listprice = prices.select(
                "span[@class='listprice']/text()").extract()

            # some of the items don't have a price
            if price:
                item['price'] = price[0]

            # some items don't have a "list price"
            if listprice:
                item['listprice'] = listprice[0]

            # extract rank and strip the trailing '.'; guard against a
            # missing rank element instead of crashing with IndexError
            rank = product.select(
                ".//span[@class='zg_rankNumber']/text()").extract()
            if rank:
                item['rank'] = rank[0][:-1]

            # department name was stored in meta by the previous callback
            item['department'] = response.meta['dept_name']

            # add url of bestsellers page this was found on
            item['bspage_url'] = response.url

            # pass the item to the parseProduct function to extract info from product page
            request = Request(item['url'], callback=self.parseProduct)
            request.meta['item'] = item

            yield request
    def parse(self, response):
        """Extract the department tabs from the page, then yield a
        parseProduct request for every product listed under each tab.
        """
        hxs = HtmlXPathSelector(response)

        #TODO: !! select currency

        # map each tab's element id to its department name
        departments = {}
        for tab in hxs.select("//ul[@id='tab-set']/li/a"):
            dept_name = tab.select("text()").extract()[0]
            tab_id = tab.select("@href").extract()[0].replace("#", "")
            departments[tab_id] = dept_name

        # walk each department tab and collect its products
        # (in compound output the Jewelry department is missing because it
        # is a duplicate of Watches)
        for tab_id, department in departments.items():
            products = hxs.select("//div[@id='%s']/div[@class='OProduct']" %
                                  tab_id)

            # rank is the 1-based position of the product within its tab
            for rank, product in enumerate(products, start=1):
                item = ProductItem()
                item['department'] = department
                item['rank'] = str(rank)

                product_link = product.select(".//div[@class='Oname']/a")
                product_name = product_link.select("text()").extract()
                product_url = product_link.select("@href").extract()

                if product_name:
                    item['list_name'] = product_name[0].strip()

                # a product without a url cannot be followed - skip it
                if not product_url:
                    continue
                item['url'] = product_url[0]

                #TODO: change price to USD
                price = product.select(
                    ".//div[@class='Oprice']/span[@class='Ovalue']/span[@class='Ovalue']/text()"
                ).extract()
                if price:
                    item['price'] = price[0]

                # hand the item to parseProduct via the request meta
                request = Request(item['url'], callback=self.parseProduct)
                request.meta['item'] = item
                yield request
Example #3
0
    def parsePage(self, response):
        """Parse one page of a paginated bestsellers list.

        Computes each product's global rank from the page number stored in
        response.meta['page'] and yields a parseProduct request per product,
        passing the partially filled item through request.meta['item'].
        """
        #TODO: add department?

        hxs = HtmlXPathSelector(response)
        products = hxs.select("//li[@class='productbox']")

        products_per_page = 40
        page_nr = response.meta['page']

        # counter to keep track of product rank within this page
        rank = 0

        for product in products:
            item = ProductItem()

            rank += 1

            product_link = product.select(".//a[@class='toplink']")
            url = product_link.select("@href").extract()
            if url:
                item['url'] = url[0]
            else:
                # without a url the product page cannot be crawled - skip
                continue

            # guard against a missing data-sku attribute instead of
            # crashing with IndexError on extract()[0]
            sku = product.select("@data-sku").extract()
            if sku:
                item['SKU'] = sku[0]

            # compute global item rank using rank on current page and page number
            item['rank'] = str((page_nr - 1) * products_per_page + rank)

            product_name = product_link.select(
                "div[@class='prodname']/text()").extract()
            brand_name = product_link.select(
                "div[@class='prodname']/div[@class='prodbrandname emphasis']/text()"
            ).extract()

            if product_name:
                item['list_name'] = product_name[0].strip()
            if brand_name:
                item['brand'] = brand_name[0].strip()

            #TODO: also "Reg price", extract that as well?
            listprice = product.select(
                ".//div[@class='wasprice']/span/text()").extract()
            if listprice:
                item['listprice'] = listprice[0]

            price = product.select(
                ".//div[@class='price secondarytext midtitle']/text() | .//div[@class='price noticetext midtitle']/text()"
            ).extract()
            if price:
                item['price'] = price[0]

            # record the scrape date (ISO format)
            item['date'] = datetime.date.today().isoformat()

            # pass item to parseProduct via the request meta
            request = Request(item['url'], callback=self.parseProduct)
            request.meta['item'] = item
            yield request
Example #4
0
 def _scrape_product_links(self, response):
     """Yield (url, partially-populated ProductItem) pairs for every
     product in the '#cat_bestSellers' list on this page."""
     parent_elts = response.css('#cat_bestSellers .item')
     for parent in parent_elts:
         product = ProductItem()
         # NOTE(review): extract()[0] raises IndexError when the name link
         # is missing - presumably every .item has one; confirm.
         url = parent.css('.product-name a::attr(href)').extract()[0]
         product['list_name'] = parent.css('.product-name a::text').extract()[0]
         price = parent.css('.special-price .price::text')
         listprice = parent.css('.old-price .price::text')
         # fall back to the first generic .price node when there is no
         # special price (an empty SelectorList is falsy)
         price = price or parent.css('.price')[0].css('::text')
         try:
             listprice = listprice or parent.css('#old-price-')[0].css('::text')
         except IndexError:
             # no old-price element at all: treat current price as list price
             listprice = price
         # join all matched text fragments into a single string
         product['price'] = ''.join(price.extract()).strip()
         product['listprice'] = ''.join(listprice.extract()).strip()
         yield url, product
    def parsePage(self, response, department):
        """Walk the product thumbnails on a bestsellers page and yield a
        parseProduct request for each product that has a url.
        """
        hxs = HtmlXPathSelector(response)
        products = hxs.select("//div[@class='productThumbnail showQuickView']")

        if not products:
            return

        # rank is the 1-based position of the product on the page
        for rank, product in enumerate(products, start=1):
            item = ProductItem()
            item['rank'] = str(rank)

            # department is supplied by the caller
            item['department'] = department

            # both name and url come from the short-description link
            product_link = product.select("div[@class='shortDescription']/a")
            name = product_link.select("text()").extract()
            if name:
                item['list_name'] = name[0]

            url = product_link.select("@href").extract()
            # a product without a url cannot be followed - skip it
            if not url:
                continue
            item['url'] = url[0]

            #TODO: add net price?

            # price = product.select(".//div[@class='prices']//span[@class='priceBig']/text()").extract()
            # if price:
            #     item['price'] = price[0]

            # hand the item to parseProduct via the request meta
            request = Request(item['url'], callback=self.parseProduct)
            request.meta['item'] = item

            yield request
Example #6
0
    def parsePage(self, response):
        """Parse one page of Best Buy bestsellers results.

        Yields a parseProduct request for each product with a url (or the
        bare item when the url is missing), and follows pagination with
        this same callback until max_products products have been seen.
        """
        hxs = HtmlXPathSelector(response)
        root_url = "http://www.bestbuy.com"
        products_per_page = 15

        max_products = self.max_products

        # find page number by adding 1 to the previous one; the first page
        # has no 'page_nr' in meta
        page_nr = response.meta.get('page_nr', 0) + 1

        # rank of the last product on the previous page; use the named
        # constant rather than a magic literal 15
        rank = (page_nr - 1) * products_per_page

        products = hxs.select("//div[@class='hproduct']")
        for product in products:
            item = ProductItem()
            rank += 1
            item['rank'] = str(rank)

            product_link = product.select("div[@class='info-main']/h3/a")
            # guard against a missing title link instead of crashing on
            # extract()[0]
            list_name = product_link.select("text()").extract()
            if list_name:
                item['list_name'] = list_name[0].strip()

            url = product_link.select("@href").extract()
            if url:
                item['url'] = root_url + url[0]

            item['department'] = response.meta['department']
            item['category'] = response.meta['category']
            item['bspage_url'] = response.url
            item['date'] = datetime.date.today().isoformat()

            # guard against a missing SKU element
            sku = product.select(".//strong[@class='sku']/text()").extract()
            if sku:
                item['SKU'] = sku[0]

            #TODO: extract product model?

            saleprice = product.select(
                "div[@class='info-side']/div/h4[@class='price sale']/span/text()"
            ).extract()
            if saleprice:
                item['price'] = saleprice[0]

            # regular price
            regprice = product.select(
                "div[@class='info-side']/h4[@class='price regular']/span/text()"
            ).extract()
            if regprice:
                item['regprice'] = regprice[0]

            if rank > max_products:
                break

            if not url:
                # no product page to follow - yield the bare item
                yield item
            else:
                # send this product page to be parsed by parseProduct
                # ! duplicates are removed: products that are in more than one category will appear in only one of them
                #TODO: include duplicates if they are from different categories?
                yield Request(item['url'],
                              callback=self.parseProduct,
                              meta={"item": item})

        # select next page, if any, parse it too with this method
        if rank < max_products:
            next_page = hxs.select(
                "//ul[@class='pagination']/li/a[@class='next']/@href").extract()
            if next_page:
                request = Request(url=root_url + next_page[0],
                                  callback=self.parsePage)
                request.meta['department'] = response.meta['department']
                request.meta['category'] = response.meta['category']
                request.meta['page_nr'] = page_nr
                yield request
    def parsePage(self, response):
        """Parse a Toys"R"Us bestsellers page.

        Handles both the overall bestsellers layout ('prodloop_cont' divs)
        and the by-department layout ('topSellersView' divs), yielding a
        parseProduct request per product.
        """
        hxs = HtmlXPathSelector(response)

        # root prefix for the relative product urls (hoisted out of the loops)
        root_url = "http://www.toysrus.com"

        # products in overall bestsellers list
        products = hxs.select("//div[@class='prodloop_cont']")

        # products in by-department bestsellers lists
        products2 = hxs.select("//div[@class='topSellersView']")

        # department name if any (for department-wise bestsellers pages)
        dept_name = ""

        #TODO: some items don't have the department field. check in nodepts_toysrus.txt
        department = hxs.select("//div[@id='breadCrumbs']/text()").extract()
        if department:
            # remove part before > and ignore first character from div content
            dept_name = department[0].split(">")[-1][1:].strip()

        # keep counter to set rank of product
        rank = 0

        for product in products:
            item = ProductItem()
            rank += 1
            item['rank'] = str(rank)

            # guard against a missing product title instead of crashing
            # with IndexError on extract()[0]
            name = product.select("a[@class='prodtitle']/text()").extract()
            if name:
                item['list_name'] = name[0]

            # get relative url of product page and add the root prefix
            url = product.select("a[@class='prodtitle']/@href").extract()
            if url:
                item['url'] = root_url + url[0]
            else:
                # if there's no url move on to the next product
                continue

            # get price ("our price")
            price = product.select("div[@class='prodPrice familyPrices']/span[@class='ourPrice2']/text()").extract()
            if price:
                item['price'] = price[0]

            # get list price
            listprice = product.select("div[@class='prodPrice familyPrices']/span[@class='listPrice2']/text()").extract()
            if listprice:
                item['listprice'] = listprice[0]

            # send the item to be parsed by parseProduct
            request = Request(item['url'], callback=self.parseProduct)
            request.meta['item'] = item
            yield request

        for product in products2:
            item = ProductItem()

            # guard against a missing product title instead of crashing
            name = product.select(".//li[@class='productTitle']/a/text()").extract()
            if name:
                item['list_name'] = name[0]

            url = product.select(".//li[@class='productTitle']/a/@href").extract()
            if url:
                item['url'] = root_url + url[0]
            else:
                # if there's no url move on to the next product
                continue

            if dept_name:
                item['department'] = dept_name

            # eliminate final . from rank; guard against a missing element
            item_number = product.select(".//div[@class='itemNumber']/text()").extract()
            if item_number:
                item['rank'] = item_number[0][:-1]

            # add bestsellers page product was found on as a field
            item['bspage_url'] = response.url

            # get price ("our price")
            price = product.select(".//li[@class='prodPrice familyPrices']/span[@class='ourPrice2']/text()").extract()
            if price:
                item['price'] = price[0]

            # get list price
            listprice = product.select(".//li[@class='prodPrice familyPrices']/span[@class='listPrice2']/text()").extract()
            if listprice:
                item['listprice'] = listprice[0]

            # send the item to be parsed by parseProduct
            request = Request(item['url'], callback=self.parseProduct)
            request.meta['item'] = item
            yield request
    def parseDepartment(self, response):
        """Parse one Walmart-style department bestsellers page, yielding a
        parseProduct request per product.

        Note: products duplicated across departments will only appear once
        on the final list (requests are de-duplicated downstream).
        """
        hxs = HtmlXPathSelector(response)

        department = response.meta['department']

        #TODO: what if there is pagination? haven't encountered it so far

        products = hxs.select("//div[@class='prodInfo']")

        # counter to keep track of product's rank
        rank = 0

        for product in products:
            item = ProductItem()

            # if inspect option was activated, add info on the context of
            # the product element on the page
            if self.inspect:
                item['prod_context'] = product.select(
                    "ancestor::*[1]").extract()

            rank += 1
            item['rank'] = str(rank)

            product_link = product.select(
                "div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']")

            product_name = product_link.select("text()").extract()
            product_url = product_link.select("@href").extract()

            if product_name:
                item['list_name'] = product_name[0]

            if product_url:
                item['url'] = self.root_url + product_url[0]
            else:
                # if there's no url move on to the next product
                continue

            item['department'] = department

            #TODO: some of the products have the "From" prefix before the price, should I include that?
            price_div = product.select(
                ".//div[@class='camelPrice'] | .//span[@class='camelPrice']")
            price1 = price_div.select(
                "span[@class='bigPriceText2']/text()").extract()
            price2 = price_div.select(
                "span[@class='smallPriceText2']/text()").extract()

            if price1 and price2:
                item['price'] = price1[0] + price2[0]
            else:
                # fall back to the out-of-stock price markup
                #TODO: include out of stock products?
                price1 = price_div.select(
                    "span[@class='bigPriceTextOutStock2']/text()").extract()
                price2 = price_div.select(
                    "span[@class='smallPriceTextOutStock2']/text()").extract()

                if price1 and price2:
                    item['price'] = price1[0] + price2[0]

            # BUG FIX: the original XPath ended in '/text' (an element
            # named "text", which never matches), so listprice was always
            # empty; '/text()' selects the actual text node
            listprice = product.select(
                ".//div[@class='PriceMLtgry']/text()").extract()
            if listprice:
                item['listprice'] = listprice[0]

            item['bspage_url'] = response.url

            # pass the item to the parseProduct method
            request = Request(item['url'], callback=self.parseProduct)
            request.meta['item'] = item

            yield request