Example #1
0
 def get_description(self, hxs):
     """Collect the descriptive fields of a product page.

     Optional sections that are missing are reported with ``print`` and
     returned as ``None`` (``""`` for the stock text).

     Args:
         hxs: page selector; only ``hxs.select(xpath).extract()`` is used.

     Returns:
         An 8-tuple, in this order:
         ``[short_description]`` - intro copy plus the "suitable for" markup,
         passed through ``basic.clean_string`` and ``basic.cdata``;
         ``[sponsored]``;
         ``desc`` - the cleaned detailed description split into consecutive
         pieces of at most 2000 characters (``[description]`` when empty);
         ``in_stock``; ``[ingredients]``; ``patient_information_url``;
         ``[offer]``; ``promotion`` (list of link texts).

     Raises:
         IndexError: if the page has no ``productIntroCopy`` block.
     """
     chunk_size = 2000

     def first_match(xpath, default, missing_message=None):
         # First extracted node for xpath, or default (optionally reported).
         matches = hxs.select(xpath).extract()
         if matches:
             return matches[0]
         if missing_message is not None:
             print(missing_message)
         return default

     short_description = hxs.select('//div[@class="productIntroCopy"]').extract()[0]
     # The next block does no indexing, so the handler only fires on
     # unexpected selector errors; it is kept as a best-effort guard.
     try:
         suitable_for = ''.join(hxs.select('//div[@id="suitableFor"]//h4 | //div[@id="suitableFor"]//p | //div[@id="suitableFor"]//div').extract())
         short_description += suitable_for
     except Exception:
         print("There's no suitable_for section")
     try:
         ingredients = basic.clean_string(' '.join(hxs.select('//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]/p | //div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]//div').extract()))
         if ingredients != '':
             ingredients = basic.cdata(ingredients)
     except Exception:
         print("No ingredients found!")
         ingredients = None
     patient_information_url = first_match(
         '//div[@class="downloadMedia"]//a/@href', None,
         "No patient information found!")
     offer = first_match(
         '//div[@id="mainOffer"]//a/text()', None, "No special offer found!")
     try:
         promotion = hxs.select('//div[@id="otherOffers"]//a/text()').extract()
     except Exception:
         print("No promotion found!")
         promotion = None
     sponsored = first_match(
         '//div[@class="sponsored"]//p/text()', None,
         "No sponsor message found!")
     description = ''.join(hxs.select('//div[@id="detailedInfo"]//div[@class="pd_panelInner"]//div[@class="pd_HTML"]').extract())
     description = basic.clean_string(description)
     # Slice ends are exclusive, so consecutive [start:start + chunk_size]
     # windows cover every character exactly once and never produce an
     # empty trailing piece.
     desc = [description[start:start + chunk_size]
             for start in range(0, len(description), chunk_size)] or [description]
     in_stock = first_match('//div[@class="icon_pl_stock"]/text()', "")
     return [basic.cdata(basic.clean_string(short_description))], [sponsored], desc, in_stock, [ingredients], patient_information_url, [offer], promotion
Example #2
0
    def get_server_path(self, url):
        """Return the storage path of the downloaded image for each URL.

        Every entry of ``url`` is cleaned with ``basic.clean_string`` and the
        cleaned value is written back, so the caller's list is modified in
        place (kept for backward compatibility).

        Args:
            url: list of image URLs.

        Returns:
            list: ``<images_store>/full/<sha1 hex digest of the URL>.jpg``
            for every URL, in input order.
        """
        images_array = []
        for i, raw_url in enumerate(url):
            cleaned = basic.clean_string(raw_url)
            url[i] = cleaned
            # sha1 works on bytes: encode text explicitly so non-ASCII URLs
            # do not fail; ASCII URLs hash exactly as before.
            if isinstance(cleaned, bytes):
                digest_input = cleaned
            else:
                digest_input = cleaned.encode("utf-8")
            images_array.append(self.images_store + "/full/" + hashlib.sha1(digest_input).hexdigest() + ".jpg")

        return images_array
Example #3
0
    def get_server_path(self, url):
        """Map image URLs to the paths their downloads are stored under.

        Each URL is passed through ``basic.clean_string``; the cleaned value
        replaces the original entry in ``url`` (the input list is updated in
        place, as before).

        Args:
            url: list of image URLs.

        Returns:
            list: one ``<images_store>/full/<sha1 hex digest>.jpg`` path per
            URL, in input order.
        """
        images_array = []
        for position, raw_url in enumerate(url):
            cleaned = basic.clean_string(raw_url)
            url[position] = cleaned
            # Hash bytes, not text: encoding to UTF-8 keeps the digest of
            # ASCII URLs unchanged and stops non-ASCII URLs from raising.
            payload = cleaned if isinstance(cleaned, bytes) else cleaned.encode("utf-8")
            images_array.append(self.images_store + "/full/" +
                                hashlib.sha1(payload).hexdigest() + ".jpg")

        return images_array
Example #4
0
 def get_prices(self, hxs):
     """Read the old price, discount price and regular price of a product.

     Args:
         hxs: page selector; only ``hxs.select(xpath).extract()`` is used.

     Returns:
         tuple: ``(old_price, discount, price)``, each passed through
         ``self.clean_price``.  ``old_price`` is the first ``lineItemList``
         value when the list has more than one label, ``discount`` is the
         last ``lineItemList`` value, and ``price`` comes from
         ``topAlignedPrice`` or, when nothing else was found, from the
         ``inlineList`` values.
     """
     tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
     value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
     old_price = []
     discount = []
     if value:
         # Labels without values used to raise IndexError on value[0];
         # indexing now only happens when there is something to index.
         if len(tag) > 1:
             old_price = [basic.clean_string(value[0])]
         discount = [basic.clean_string(value[-1])]
     else:
         print("This product has no price.")
     price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
     if not old_price and not discount and not price:
         price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
     return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)
Example #5
0
 def get_size(self, hxs):
     """Read the product size and its price-per-size text.

     Args:
         hxs: page selector; only ``hxs.select(xpath).extract()`` is used.

     Returns:
         tuple: ``([size], [price_per_size])`` when both spans are present,
         otherwise ``(None, None)``.  ``size`` is cleaned with
         ``basic.clean_string`` and has ``|`` characters removed.
     """
     try:
         size = hxs.select('//span[@class="size"]/text()').extract()[0]
         size = basic.clean_string(size)
         size = size.replace("|", "")
         price_per_size = hxs.select('//span[@class="pricePerSize"]/text()').extract()[0]
         return [size], [price_per_size]
     except Exception:
         # Still best-effort, but no longer swallows KeyboardInterrupt or
         # SystemExit the way a bare ``except:`` does.
         print("No size found")
         return None, None
 def get_prices(self, hxs):
     """Extract old, discounted and regular prices from a product page.

     Args:
         hxs: page selector; only ``hxs.select(xpath).extract()`` is used.

     Returns:
         tuple: ``(old_price, discount, price)``, each run through
         ``self.clean_price``.  ``old_price`` is taken from the first
         ``lineItemList`` value (only when more than one label exists),
         ``discount`` from the last one, and ``price`` from
         ``topAlignedPrice`` with ``inlineList`` as the fallback when all
         three would otherwise be empty.
     """
     tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
     value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
     old_price = []
     discount = []
     if not value:
         # Nothing to index: report it instead of letting value[0] raise
         # when labels exist without values.
         print("This product has no price.")
     else:
         if len(tag) > 1:
             old_price = [basic.clean_string(value[0])]
         discount = [basic.clean_string(value[-1])]
     price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
     if not (old_price or discount or price):
         price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
     return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)
Example #7
0
 def shl_basic_info(self, hxs):
     """Read name, price and style from a ``cat-ens-prod-info`` block.

     The XPaths are relative, so ``hxs`` is expected to be positioned on the
     parent of that block.

     Returns:
         tuple: ``(name, price, style)``; ``style`` is a one-element list
         built from the second text node when more than two text nodes
         exist, otherwise an empty list.
     """
     raw_name = hxs.select('div[@class="cat-ens-prod-info"]/h1/text()').extract()
     name = basic.clean_string_field(raw_name)
     raw_price = hxs.select('div[@class="cat-ens-prod-info"]/span/text()').extract()
     price = basic.clean_spaces_field(basic.clean_string_field(raw_price))
     text_nodes = hxs.select('div[@class="cat-ens-prod-info"]/text()').extract()
     if len(text_nodes) <= 2:
         return name, price, []
     return name, price, [basic.clean_string(text_nodes[1])]
Example #8
0
 def get_basic_info(self, hxs):
     """Return the product description and its promotional text.

     Returns:
         tuple: ``(description, promo_text)``.  ``description`` is a
         one-element list holding the first ``cat-pro-desc`` node after
         ``basic.clean_string`` and ``basic.cdata``.  ``promo_text`` is the
         promo span's text nodes (or its ``<font>`` children when there is
         no direct text), passed through ``basic.cdata_field`` when
         non-empty.

     Raises:
         IndexError: if the page has no ``cat-pro-desc`` element.
     """
     raw_description = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
     description = [basic.cdata(basic.clean_string(raw_description))]
     # ``or`` evaluates the <font> fallback only when no direct text exists.
     promo_text = (
         hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
         or hxs.select('//span[@class="cat-pro-promo-text"]/font').extract()
     )
     if not promo_text:
         return description, promo_text
     return description, basic.cdata_field(promo_text)
Example #9
0
 def get_basic_info(self, hxs):
     """Read name, price, old price and description of a product.

     Returns:
         tuple: ``(name, price, old_price, [description])``.  ``price`` and
         ``old_price`` are reduced to digits and dots; ``old_price`` stays
         an empty list when the page has none.

     Raises:
         IndexError: if the ``details`` block or every price source is
         missing.
     """
     def digits_and_dots(text):
         # Drop every character that is not a digit or a dot.
         return re.sub('[^0-9.]', '', text)

     name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
     price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
     details_markup = hxs.select('//div[@id="details"]').extract()[0]
     description = basic.clean_string(basic.cdata(details_markup))
     old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
     if not price:
         # Fall back to the alternative price element.
         price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
     if old_price:
         old_price = [digits_and_dots(old_price[0])]
     return name, [digits_and_dots(price[0])], old_price, [description]
Example #10
0
 def get_basic_info(self, hxs):
     """Collect the basic fields of a product page.

     Returns:
         tuple: ``(name, price, old_price, [description])`` where both price
         values are stripped down to digits and dots and ``old_price`` is
         left as an empty list when absent.

     Raises:
         IndexError: if the page has no ``details`` block or no price at all.
     """
     name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
     price = hxs.select(
         '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()'
     ).extract()
     details = hxs.select('//div[@id="details"]').extract()
     description = basic.clean_string(basic.cdata(details[0]))
     previous = hxs.select(
         '//span[@class="yourprice_product"]/text()').extract()
     # The second lookup only runs when the first found nothing.
     price = price or hxs.select('//span[@id="PriceDisplay"]/text()').extract()
     old_price = [re.sub('[^0-9.]', '', previous[0])] if previous else previous
     price = [re.sub('[^0-9.]', '', price[0])]
     return name, price, old_price, [description]
Example #11
0
 def get_price(self, hxs):
     """Read the regular price and, when present, the sale price.

     The regular price is taken from the first of three candidate elements
     that yields any text.  It is cleaned, has runs of spaces collapsed and
     the ``Price:`` / ``Prix:`` labels removed, then goes through
     ``basic.cdata``.

     Returns:
         tuple: ``([price], discount)``; ``discount`` is the result of
         ``basic.cdata_field`` when a sale price exists, else an empty list.

     Raises:
         IndexError: if none of the candidate elements yields text.
     """
     candidate_xpaths = (
         '//span[@id="divUnitPrice"]/text()',
         '//div[@id="product_price"]/span[1]/text()',
         '//div[@id="product_price"]/text()',
     )
     raw_prices = []
     for xpath in candidate_xpaths:
         raw_prices = hxs.select(xpath).extract()
         if raw_prices:
             break
     discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
     text = re.sub(" +", " ", basic.clean_string(raw_prices[0]))
     for label in ("Price:", "Prix:"):
         text = text.replace(label, "")
     price = basic.cdata(text.strip())
     if discount:
         discount = basic.cdata_field(discount)
     return [price], discount
Example #12
0
    def get_variants(self, hxs, response):
        """Build the serialized colour/size variant records of a product page.

        The page markup is inspected as text.  If the ``<div class="color">``
        (or ``<div class="size">``) section contains no ``<select name``, the
        single value is read from the div text and a hidden input; otherwise
        option labels and values are cut out of the ``<select>`` markup with
        ``basic.get_middle_text``.  When the single size text is empty,
        ``self.get_data`` is called once per ``value[i]`` and the sizes are
        parsed from each returned page.

        Returns:
            list: ``basic.cdata(...)`` of JSON-encoded dicts that use the keys
            ``color``, ``color_value``, ``size`` and ``size_value``.

        Raises:
            IndexError: if the page text has no ``<div class="color">`` or
            ``<div class="size">`` marker (``split(...)[1]``), or an
            expected node/list is empty.
        """
        page = hxs.select("//html").extract()
        page = " ".join(page)
        # One dict is reused for every record; it is serialized right before
        # each append, so later writes do not alter earlier records.
        dict_one = {}
        test_one = []

        # Keep only the markup between '<div class="color">' and the next
        # '</div>', then split on '<select name' to detect a dropdown.
        temp = page.split('<div class="color">')
        temp = temp[1].split("</div>")
        temp = temp[0].split("<select name")

        # Only view_page, even_page, pre_page and hidd_page are used below;
        # the first four values are unpacked but never read in this method.
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs
        )

        if len(temp) == 1:
            # No dropdown: single colour taken from the div text, its value
            # from the hidden Variant1 input.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            # Dropdown: take the options that follow the 'farge</option>'
            # entry and split them into labels and values.
            test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
            color = basic.get_middle_text(test_color[0], '">', "</option>")
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        # Same detection for the size section.
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split("</div>")
        size_temp = size_temp[0].split("<select name")

        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()

            if size[0] == "":
                # No size text on this page: fetch the page for each colour
                # value and parse the sizes from that response.
                # NOTE(review): after the single-colour branch above, `value`
                # is value[0] (presumably a string) and `color` is the result
                # of basic.clean_string, so this loop and color[i]/value[i]
                # would index characters - confirm this is intended.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split("</select>")

                    if len(a_page) == 1:

                        # No '</select>' in the size section of the response.
                        size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                        size = basic.get_middle_text(a_page[0], '">', "</option>")
                        size_val = basic.get_middle_text(a_page[0], 'value="', '">')

                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        # Assigned inside the loop: when `size` is empty the
                        # key keeps whatever an earlier iteration stored.
                        dict_one["size"] = size

                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one["size"] = size
                dict_one["size_value"] = size_val
                # NOTE(review): this branch serializes with simplejson while
                # the other branches use json - confirm both are imported.
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            # Size dropdown present: options after the 'se</option>' entry.
            test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
            size = basic.get_middle_text(test_size[0], '">', "</option>")
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one
Example #13
0
 def get_name(self, hxs):
     """Return the product title as a one-element list.

     The title is the first text node of the ``<h1>`` inside
     ``cat-pro-con-detail``, cleaned with ``basic.clean_string``.

     Raises:
         IndexError: if the heading is not on the page.
     """
     headings = hxs.select('//div[@id="cat-pro-con-detail"]/h1/text()').extract()
     return [basic.clean_string(headings[0])]
Example #14
0
    def parse(self, response):
        """Scrape one product page, export it to XML and return the item.

        Pages containing a ``noResultsContent`` div are recorded as
        ``NOT_AVAILABLE``; all other pages are scraped in full.  Any error is
        caught, the product status is set to ``"error"`` when the URL is
        tracked, and the failure is reported through
        ``self.exc.code_handler(100, url)``.

        Args:
            response: the downloaded page; ``response.url`` identifies the
                product in ``self.products['urls']``.

        Returns:
            KennethItem: the item, possibly only partially filled when an
            error occurred.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = KennethItem()
        # Read before the try block so the error handler can always use it.
        cur_url = response.url
        # Main try for the script: any failure below is routed to the
        # general handler, which records the URL where it happened.
        try:
            # A noResultsContent div means the product does not exist on
            # the site; otherwise continue scraping the page.
            no_results = hxs.select('//div[@id="noResultsContent"]').extract()

            if not no_results:
                index = self.products['urls'].index(cur_url)
                cur_id = self.get_product_id(cur_url)
                product_id = self.products['product_ids'][index]
                page = hxs.select('//div[@id="mainContent"]').extract()
                page = " ".join(page)
                item['name'], item['description'] = self.get_basic_info(hxs)
                price, new_p, old_p = self.get_prices(hxs)
                if new_p:
                    item['new_price'] = new_p
                    item['old_price'] = old_p
                else:
                    item['price'] = price
                desc = basic.clean_string(item['description'][0])
                item['description'] = [desc]
                urls = self.get_color_image(hxs)
                new = self.get_image_server_path(urls, product_id)
                item['color_image_urls'] = new
                self.export(item['color_image_urls'], [product_id], "swatchImage")
                jsons, images = self.we_also_recommend(cur_id, product_id)
                item['product_page'] = [cur_url]
                item['product_id'] = [product_id]
                item['add_to_cart_id'] = [cur_id]
                item['recommended_product'] = jsons
                item['in_stock'] = ["IN_STOCK"]
                self.products['status'][index] = "ran"
                images_or_404 = self.get_colors(hxs, page, product_id)
                if images_or_404 == 404:
                    item['in_stock'] = ["NOT_AVAILABLE"]
                # The XML is written before image_urls is filled in.
                self.xml.create_xml(item)
                item['image_urls'] = []
                if images_or_404 != 404:
                    item['image_urls'] += images_or_404
                item['image_urls'] += urls
                item['image_urls'] += images
                #self.export(item['image_urls'])
                #item['image_urls'] = [] #uncomment for downloading images

            else:
                # Handling for products that are not available: the URL is
                # rebuilt from the product id before the lookup.
                cur_id = self.get_product_id(cur_url)
                cur_url = "http://www.kennethcole.com/product/index.jsp?"
                cur_url += "productId=" + str(cur_id)
                index = self.products['urls'].index(cur_url)
                self.products['status'][index] = "no_avail"
                item['product_id'] = [self.products['product_ids'][index]]
                if self.products['product_ids'][index]:
                    item['name'] = [self.products['names'][index]]
                else:
                    item['name'] = ["not available"]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                self.exc.code_handler(102, cur_url)
        except Exception:
            # Record the failure and the URL where it happened.
            print("Error occured scraping this product")
            tracked_urls = self.products['urls']
            # The URL may be missing from the tracked list (that can be the
            # very reason for the failure); an unguarded index() would raise
            # here and the error would never be reported.
            if cur_url in tracked_urls:
                self.products['status'][tracked_urls.index(cur_url)] = "error"
            self.exc.code_handler(100, cur_url)
        return item
Example #15
0
 def get_basic_info(self, hxs):
     """Return the product name and brand.

     Returns:
         tuple: ``(name, brand)``.  ``name`` is a one-element list with the
         joined ``h1.fn`` text nodes, cleaned with ``basic.clean_string``
         and with ``\\xa0`` characters removed; ``brand`` is the list of
         ``span.brand`` text nodes.
     """
     title_parts = hxs.select('//h1[@class="fn"]/text()').extract()
     title = basic.clean_string("".join(title_parts))
     brand = hxs.select('//span[@class="brand"]/text()').extract()
     return [title.replace(u"\xa0", "")], brand
 def get_basic_info(self, hxs):
     """Read the product name and brand from the page.

     Returns:
         tuple: ``(name, brand)``: a one-element list holding the cleaned
         ``h1.fn`` text without ``\\xa0`` characters, and the list of
         ``span.brand`` text nodes.
     """
     heading_text = "".join(hxs.select('//h1[@class="fn"]/text()').extract())
     cleaned_heading = basic.clean_string(heading_text)
     brand = hxs.select('//span[@class="brand"]/text()').extract()
     name = [cleaned_heading.replace(u"\xa0", "")]
     return name, brand
Example #17
0
    def get_variants(self, hxs, response):
        """Build the serialized colour/size variant records of a product page.

        The page markup is inspected as text.  If the ``<div class="color">``
        (or ``<div class="size">``) section contains no ``<select name``, the
        single value is read from the div text and a hidden input; otherwise
        option labels and values are cut out of the ``<select>`` markup with
        ``basic.get_middle_text``.  When the single size text is empty,
        ``self.get_data`` is called once per ``value[i]`` and the sizes are
        parsed from each returned page.

        Returns:
            list: ``basic.cdata(...)`` of JSON-encoded dicts that use the keys
            ``color``, ``color_value``, ``size`` and ``size_value``.

        Raises:
            IndexError: if the page text has no ``<div class="color">`` or
            ``<div class="size">`` marker (``split(...)[1]``), or an
            expected node/list is empty.
        """
        page = hxs.select('//html').extract()
        page = " ".join(page)
        # One dict is reused for every record; it is serialized right before
        # each append, so later writes do not alter earlier records.
        dict_one = {}
        test_one = []

        # Keep only the markup between '<div class="color">' and the next
        # '</div>', then split on '<select name' to detect a dropdown.
        temp = page.split('<div class="color">')
        temp = temp[1].split('</div>')
        temp = temp[0].split('<select name')

        # Only view_page, even_page, pre_page and hidd_page are used below;
        # the first four values are unpacked but never read in this method.
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs)

        if (len(temp) == 1):
            # No dropdown: single colour taken from the div text, its value
            # from the hidden Variant1 input.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value'
            ).extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            # Dropdown: take the options that follow the 'farge</option>'
            # entry and split them into labels and values.
            test_color = basic.get_middle_text(temp[1], 'farge</option>',
                                               '</select>')
            color = basic.get_middle_text(test_color[0], '">', '</option>')
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        # Same detection for the size section.
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split('</div>')
        size_temp = size_temp[0].split('<select name')

        if (len(size_temp) == 1):
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value'
            ).extract()

            if size[0] == "":
                # No size text on this page: fetch the page for each colour
                # value and parse the sizes from that response.
                # NOTE(review): after the single-colour branch above, `value`
                # is value[0] (presumably a string) and `color` is the result
                # of basic.clean_string, so this loop and color[i]/value[i]
                # would index characters - confirm this is intended.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page,
                                              pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split('</select>')

                    if len(a_page) == 1:

                        # No '</select>' in the size section of the response.
                        size = basic.get_middle_text(a_page[0], 'e">',
                                                     '<input type="hidden"')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        a_page = basic.get_middle_text(a_page[0],
                                                       'se</option>',
                                                       '</select>')
                        size = basic.get_middle_text(a_page[0], '">',
                                                     '</option>')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '">')

                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        # Assigned inside the loop: when `size` is empty the
                        # key keeps whatever an earlier iteration stored.
                        dict_one["size"] = size

                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one['size'] = size
                dict_one['size_value'] = size_val
                # NOTE(review): this branch serializes with simplejson while
                # the other branches use json - confirm both are imported.
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            # Size dropdown present: options after the 'se</option>' entry.
            test_size = basic.get_middle_text(size_temp[1], 'se</option>',
                                              '</select>')
            size = basic.get_middle_text(test_size[0], '">', '</option>')
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one['size'] = size
            dict_one['size_value'] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one