def get_description(self, hxs):
    """Collect the descriptive sections of a product page.

    Optional sections are read best-effort: when one cannot be read a
    message is printed and a placeholder is returned for it.  The intro copy
    is the only mandatory part; when the ``productIntroCopy`` div is absent
    the ``IndexError`` raised by the lookup propagates to the caller.

    Args:
        hxs: page selector exposing ``select(xpath).extract()``.

    Returns:
        A tuple of eight values, in this order:

        * ``[short_description]`` -- intro copy plus any "suitable for"
          markup, passed through ``basic.clean_string`` and ``basic.cdata``.
        * ``[sponsored]`` -- sponsor message text, or ``None``.
        * ``desc`` -- the cleaned detailed description split into
          consecutive pieces of at most 2000 characters; a one-element list
          when the description is shorter than that (or empty).
        * ``in_stock`` -- stock label text, or ``""`` when absent.
        * ``[ingredients]`` -- ingredients text (passed through
          ``basic.cdata`` unless empty), or ``None`` on failure.
        * ``patient_information_url`` -- first download link, or ``None``.
        * ``[offer]`` -- main offer text, or ``None``.
        * ``promotion`` -- list of "other offers" link texts, or ``None``
          on failure.

    Raises:
        IndexError: if the page has no ``productIntroCopy`` div.
    """
    chunk_size = 2000

    def first_match(xpath, missing_message=None, default=None):
        # First node extracted for ``xpath``; on failure optionally report
        # and fall back to ``default``.  ``Exception`` (rather than a bare
        # ``except``) keeps the best-effort behaviour without swallowing
        # KeyboardInterrupt / SystemExit.
        try:
            return hxs.select(xpath).extract()[0]
        except Exception:
            if missing_message is not None:
                print(missing_message)
            return default

    short_description = hxs.select(
        '//div[@class="productIntroCopy"]').extract()[0]

    try:
        suitable_for = ''.join(hxs.select(
            '//div[@id="suitableFor"]//h4 | '
            '//div[@id="suitableFor"]//p | '
            '//div[@id="suitableFor"]//div').extract())
        short_description += suitable_for
    except Exception:
        print("There's no suitable_for section")

    try:
        ingredients = basic.clean_string(' '.join(hxs.select(
            '//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]/p | '
            '//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]//div'
        ).extract()))
        if ingredients != '':
            ingredients = basic.cdata(ingredients)
    except Exception:
        print("No ingredients found!")
        ingredients = None

    patient_information_url = first_match(
        '//div[@class="downloadMedia"]//a/@href',
        "No patient information found!")
    offer = first_match(
        '//div[@id="mainOffer"]//a/text()', "No special offer found!")

    try:
        promotion = hxs.select('//div[@id="otherOffers"]//a/text()').extract()
    except Exception:
        print("No promotion found!")
        promotion = None

    sponsored = first_match(
        '//div[@class="sponsored"]//p/text()', "No sponsor message found!")

    description = ''.join(hxs.select(
        '//div[@id="detailedInfo"]//div[@class="pd_panelInner"]'
        '//div[@class="pd_HTML"]').extract())
    description = basic.clean_string(description)

    # Split into consecutive, non-overlapping pieces.  The slice end is
    # exclusive, so no "- 1" is needed: subtracting one would silently drop
    # the last character of every full chunk.  Stepping by chunk_size also
    # avoids an empty trailing piece when the length is an exact multiple.
    desc = [description[start:start + chunk_size]
            for start in range(0, len(description), chunk_size)]
    if not desc:
        # Empty description: keep returning a single (empty) element.
        desc = [description]

    # A missing stock label is not reported, it just yields "".
    in_stock = first_match(
        '//div[@class="icon_pl_stock"]/text()', default="")

    return ([basic.cdata(basic.clean_string(short_description))],
            [sponsored], desc, in_stock, [ingredients],
            patient_information_url, [offer], promotion)
def get_server_path(self, url):
    """Map image URLs to their paths inside the image store.

    Each entry of ``url`` is replaced in place by its
    ``basic.clean_string`` result; the returned path is
    ``<images_store>/full/<sha1 hex digest of the cleaned url>.jpg``.

    Args:
        url: mutable sequence of image URLs (modified in place).

    Returns:
        List of store paths, one per URL, in the same order.
    """
    server_paths = []
    for position, raw_url in enumerate(url):
        cleaned = basic.clean_string(raw_url)
        # Callers keep using the same list, so write the cleaned URL back.
        url[position] = cleaned
        digest = hashlib.sha1(cleaned).hexdigest()
        server_paths.append(self.images_store + "/full/" + digest + ".jpg")
    return server_paths
def get_prices(self, hxs):
    """Read old price, discount and current price from a product page.

    Args:
        hxs: page selector exposing ``select(xpath).extract()``.

    Returns:
        Tuple ``(old_price, discount, price)``; each element is the result
        of ``self.clean_price`` applied to a (possibly empty) list.
    """
    labels = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
    amounts = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()

    old_price, discount, price = [], [], []

    # The first amount only counts as an old price when the list has more
    # than one label.
    if len(labels) > 1:
        old_price = [basic.clean_string(amounts[0])]

    # The last amount is taken as the discount; an empty list is reported.
    try:
        discount = [basic.clean_string(amounts[-1])]
    except IndexError:
        print("This product has no price.")

    try:
        price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
    except IndexError:
        print("This product has no price.")

    # Nothing found anywhere above: fall back to the inline price list.
    if not (old_price or discount or price):
        price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()

    cleaned_old = self.clean_price(old_price)
    cleaned_discount = self.clean_price(discount)
    cleaned_price = self.clean_price(price)
    return cleaned_old, cleaned_discount, cleaned_price
def get_size(self, hxs):
    """Read the pack size and the price per size unit.

    Args:
        hxs: page selector exposing ``select(xpath).extract()``.

    Returns:
        ``([size], [price_per_size])`` when both values are present, with
        ``size`` cleaned and stripped of ``|`` characters; ``(None, None)``
        when anything goes wrong while reading them.
    """
    try:
        raw_size = hxs.select('//span[@class="size"]/text()').extract()[0]
        size = basic.clean_string(raw_size).replace("|", "")
        price_per_size = hxs.select(
            '//span[@class="pricePerSize"]/text()').extract()[0]
    except:
        # Deliberately broad: any failure is treated as "no size".
        print("No size found")
        return None, None
    return [size], [price_per_size]
def shl_basic_info(self, hxs):
    """Read name, price and style from a ``cat-ens-prod-info`` block.

    The XPath expressions are relative, so ``hxs`` is expected to be a
    selector positioned on the parent of that block.

    Returns:
        Tuple ``(name, price, style)``.  ``name`` and ``price`` are the
        field lists returned by the ``basic`` cleaning helpers; ``style`` is
        a one-element list holding the cleaned second text node when the
        block has more than two text nodes, otherwise ``[]``.
    """
    name = basic.clean_string_field(
        hxs.select('div[@class="cat-ens-prod-info"]/h1/text()').extract())

    raw_price = hxs.select(
        'div[@class="cat-ens-prod-info"]/span/text()').extract()
    price = basic.clean_spaces_field(basic.clean_string_field(raw_price))

    text_nodes = hxs.select('div[@class="cat-ens-prod-info"]/text()').extract()
    style = [basic.clean_string(text_nodes[1])] if len(text_nodes) > 2 else []

    return name, price, style
def get_basic_info(self, hxs):
    """Gets basic info about products.

    Returns a tuple ``(description, promo_text)``: ``description`` is a
    one-element list with the cleaned ``cat-pro-desc`` markup passed through
    ``basic.cdata``; ``promo_text`` is the promo text (or, failing that, the
    promo ``<font>`` markup) passed through ``basic.cdata_field``, or an
    empty list when the page has neither.

    Raises IndexError when the page has no ``cat-pro-desc`` element.
    """
    description_html = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
    description = [basic.cdata(basic.clean_string(description_html))]

    # Prefer plain text; only query the <font> markup when there is none.
    promo_text = (
        hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
        or hxs.select('//span[@class="cat-pro-promo-text"]/font').extract()
    )
    if promo_text:
        promo_text = basic.cdata_field(promo_text)

    return description, promo_text
def get_basic_info(self, hxs):
    """Read name, price, old price and description of a product.

    Returns:
        Tuple ``(name, price, old_price, [description])``.  ``price`` is a
        one-element list reduced to digits and dots; ``old_price`` is
        reduced the same way when present, otherwise the empty list.

    Raises:
        IndexError: when the details block or every price source is missing.
    """
    def digits_only(text):
        # Keep digits and the decimal point, drop everything else.
        return re.sub('[^0-9.]', '', text)

    name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
    price = hxs.select(
        '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()'
    ).extract()
    description = basic.clean_string(
        basic.cdata(hxs.select('//div[@id="details"]').extract()[0]))
    old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()

    # Fall back to the alternative price element when the main one is empty.
    if not price:
        price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()

    if old_price:
        old_price = [digits_only(old_price[0])]
    price = [digits_only(price[0])]

    return name, price, old_price, [description]
def get_basic_info(self, hxs):
    """Extract the basic product fields from a product detail page.

    Returns ``(name, price, old_price, [description])``.  Prices are
    stripped of every character that is not a digit or a dot; ``old_price``
    stays an empty list when the page shows none.  An ``IndexError`` is
    raised when the details block or all price elements are missing.
    """
    name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
    price = hxs.select(
        '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()'
    ).extract()

    details_html = hxs.select('//div[@id="details"]').extract()[0]
    description = basic.clean_string(basic.cdata(details_html))

    old_price = hxs.select(
        '//span[@class="yourprice_product"]/text()').extract()

    # Secondary price location, queried only when the primary one is empty.
    price = price or hxs.select('//span[@id="PriceDisplay"]/text()').extract()

    if old_price:
        first_old = old_price[0]
        old_price = [re.sub('[^0-9.]', '', first_old)]

    first_price = price[0]
    price = [re.sub('[^0-9.]', '', first_price)]

    return name, price, old_price, [description]
def get_price(self, hxs):
    """Getting product prices.

    Gets regular and discount price if there is one.  The regular price is
    taken from the first of three locations that yields text, has runs of
    spaces collapsed and the "Price:" / "Prix:" labels removed, and is
    passed through ``basic.cdata``.

    Returns ``([price], discount)`` where ``discount`` is the
    ``basic.cdata_field`` result for the sale price, or an empty list.
    Raises IndexError when no price location yields any text.
    """
    price_locations = (
        '//span[@id="divUnitPrice"]/text()',
        '//div[@id="product_price"]/span[1]/text()',
        '//div[@id="product_price"]/text()',
    )
    found = []
    for location in price_locations:
        found = hxs.select(location).extract()
        if found:
            break

    discount = hxs.select(
        '//div[@id="product_price"]/span[@class="pc-salePrice"]/text()'
    ).extract()

    text = basic.clean_string(found[0])
    text = re.sub(" +", " ", text)
    for label in ("Price:", "Prix:"):
        text = text.replace(label, "")
    price = basic.cdata(text.strip())

    if discount:
        discount = basic.cdata_field(discount)
    return [price], discount
def get_variants(self, hxs, response):
    """Build the colour/size variant descriptors of a product page.

    The page markup is cut up with plain string splits around the
    ``<div class="color">`` and ``<div class="size">`` elements.  Each
    variant is collected in a dict with the keys ``color``,
    ``color_value``, ``size`` and ``size_value``, dumped to JSON and passed
    through ``basic.cdata``.

    Args:
        hxs: page selector exposing ``select(xpath).extract()``.
        response: the page response; forwarded to ``self.get_vars`` and
            ``self.get_data``.

    Returns:
        List of the ``basic.cdata(...)`` results, one per appended variant.

    Raises:
        IndexError: when the page markup lacks the colour or size div (the
            ``[1]`` lookups after ``split``), or an expected node is empty.
    """
    page = hxs.select("//html").extract()
    page = " ".join(page)
    dict_one = {}
    test_one = []
    # Keep only the inside of the colour div, then check whether it holds a
    # <select>: one piece after the last split means there is no drop-down.
    temp = page.split('<div class="color">')
    temp = temp[1].split("</div>")
    temp = temp[0].split("<select name")
    # Only view_page, even_page, pre_page and hidd_page are used below; the
    # first four values are unpacked but not read in this method.
    viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
        response, hxs
    )
    if len(temp) == 1:
        # Single colour: text of the div plus the hidden input's value.
        color = hxs.select('//div[@class="color"]/text()').extract()
        value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
        color[0] = color[0].replace(" ", "")
        color = basic.clean_string(color[0])
        # NOTE(review): from here on ``value`` is a single extracted item,
        # not a list -- see the loop over ``value`` further down.
        value = value[0]
        # color = basic.clean_string(color[0])
        # color = color.replace(" ","")
        #
        # dict['color'] = color
        # dict['color_value'] = value[0]
    else:
        # Colour drop-down: read option labels and values from the markup
        # after the "farge</option>" placeholder option (presumably the
        # Norwegian "colour" prompt -- confirm).
        test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
        color = basic.get_middle_text(test_color[0], '">', "</option>")
        value = basic.get_middle_text(test_color[0], 'value="', '">')
        for i in range(0, len(color)):
            color[i] = color[i].replace(" ", "")
        #
        # dict['color'] = color
        # dict['color_value'] = value
    # Same splitting trick for the size div.
    size_temp = page.split('<div class="size">')
    size_temp = size_temp[1].split("</div>")
    size_temp = size_temp[0].split("<select name")
    if len(size_temp) == 1:
        # No size drop-down in the initial page.
        size = hxs.select('//div[@class="size"]/text()').extract()
        size = basic.clean_string(size[0])
        size = [size.replace(" ", "")]
        size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()
        if size[0] == "":
            # Size text is empty: fetch the sizes per colour value through
            # self.get_data and emit one descriptor per colour.
            # NOTE(review): in the single-colour branch above ``value`` and
            # ``color`` were rebound to single items, so ``value[i]`` /
            # ``color[i]`` index into those items here -- confirm intended.
            for i in range(len(value)):
                resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])
                a_page = resp_page.split('<div class="siz')
                a_page = a_page[1].split("</select>")
                if len(a_page) == 1:
                    # No </select> in the returned size block: single size.
                    size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                    size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                    size_val = size_val[0]
                    size_val = [size_val]
                else:
                    # Size drop-down: options after the "...se</option>"
                    # placeholder option.
                    a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                    size = basic.get_middle_text(a_page[0], '">', "</option>")
                    size_val = basic.get_middle_text(a_page[0], 'value="', '">')
                dict_one["color"] = color[i]
                dict_one["color_value"] = value[i]
                dict_one["size_value"] = size_val
                for x in range(0, len(size)):
                    size[x] = basic.clean_string(size[x])
                    size[x] = size[x].replace(" ", "")
                dict_one["size"] = size
                # dict_one is reused across iterations; it is serialised
                # right away, so earlier entries are not affected.
                test_one.append(basic.cdata(json.dumps(dict_one)))
        else:
            # Size text present: a single descriptor for the whole page.
            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val
            # NOTE(review): this branch serialises with simplejson while the
            # other branches use json.
            test_one.append(basic.cdata(simplejson.dumps(dict_one)))
    else:
        # Size drop-down present in the initial page.
        test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
        size = basic.get_middle_text(test_size[0], '">', "</option>")
        size_val = basic.get_middle_text(test_size[0], 'value="', '">')
        for x in range(0, len(size)):
            size[x] = basic.clean_string(size[x])
            size[x] = size[x].replace(" ", "")
        dict_one["color"] = color
        dict_one["color_value"] = value
        dict_one["size"] = size
        dict_one["size_value"] = size_val
        test_one.append(basic.cdata(json.dumps(dict_one)))
    return test_one
def get_name(self, hxs):
    """Return the product name as a one-element list.

    The name is the first ``<h1>`` text node inside the
    ``cat-pro-con-detail`` div, passed through ``basic.clean_string``.
    Raises IndexError when the page has no such heading.
    """
    headings = hxs.select('//div[@id="cat-pro-con-detail"]/h1/text()').extract()
    first_heading = headings[0]
    return [basic.clean_string(first_heading)]
def parse(self, response):
    """Scrape one product page into a ``KennethItem``.

    Increments the progress counter, then either scrapes the product (when
    the page has no ``noResultsContent`` div) or records it as not
    available.  In both cases the item is written out through
    ``self.xml.create_xml`` and the matching entry of
    ``self.products['status']`` is updated ("ran", "no_avail" or "error").

    Args:
        response: the downloaded product page.

    Returns:
        The ``KennethItem``; it may be only partially filled when an error
        occurred part-way through.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = KennethItem()
    # Main try for the whole scrape: any error is handled by the general
    # except below, which records the URL where it happened.
    try:
        cur_url = response.url
        # Look for the noResultsContent div: when it exists the product is
        # no longer on the site, otherwise continue scraping the page.
        available = hxs.select('//div[@id="noResultsContent"]').extract()
        if not available:
            # Position of this URL in the product lists (raises when the
            # URL is unknown, which lands in the except below).
            index = self.products['urls'].index(cur_url)
            cur_id = self.get_product_id(cur_url)
            # NOTE(review): ``id`` shadows the builtin of the same name.
            id = self.products['product_ids'][index]
            page = hxs.select('//div[@id="mainContent"]').extract()
            page = " ".join(page)
            item['name'], item['description'] = self.get_basic_info(hxs)
            price, new_p, old_p = self.get_prices(hxs)
            # A new price means the product is discounted: store the
            # new/old pair instead of the plain price.
            if new_p:
                item['new_price'] = new_p
                item['old_price'] = old_p
            else:
                item['price'] = price
            desc = basic.clean_string(item['description'][0])
            item['description'] = [desc]
            urls = self.get_color_image(hxs)
            new = self.get_image_server_path(urls, id)
            item['color_image_urls'] = new
            self.export(item['color_image_urls'], [id], "swatchImage")
            jsons, images = self.we_also_recommend(cur_id, id)
            item['product_page'] = [cur_url]
            item['product_id'] = [id]
            item['add_to_cart_id'] = [cur_id]
            item['recommended_product'] = jsons
            item['in_stock'] = ["IN_STOCK"]
            self.products['status'][index] = "ran"
            # get_colors returns 404 instead of image data when the product
            # turns out not to be available.
            images_or_404 = self.get_colors(hxs, page, id)
            if images_or_404 == 404:
                item['in_stock'] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)
            # image_urls is filled only after the XML has been written.
            item['image_urls'] = []
            if images_or_404 != 404:
                item['image_urls'] += images_or_404
            item['image_urls'] += urls
            item['image_urls'] += images
            #self.export(item['image_urls'])
            #item['image_urls'] = [] #uncomment for downloading images
        else:
            # Part for handling products that are not available: rebuild
            # the canonical product URL from the id before looking it up.
            cur_id = self.get_product_id(cur_url)
            cur_url = "http://www.kennethcole.com/product/index.jsp?"
            cur_url += "productId=" + str(cur_id)
            index = self.products['urls'].index(cur_url)
            self.products['status'][index] = "no_avail"
            item['product_id'] = [self.products['product_ids'][index]]
            if self.products['product_ids'][index]:
                item['name'] = [self.products['names'][index]]
            else:
                item['name'] = ["not available"]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)
            self.exc.code_handler(102, cur_url)
    except:
        # Part for catching errors and keeping track of how many happened
        # and on which URLs.
        # NOTE(review): the bare except also catches KeyboardInterrupt, and
        # the .index() lookup below can itself raise when cur_url is not in
        # the list -- confirm this is acceptable.
        print "Error occured scraping this product"
        index = self.products['urls'].index(cur_url)
        self.products['status'][index] = "error"
        self.exc.code_handler(100, cur_url)
    return item
def get_basic_info(self, hxs):
    """Read the product name and brand.

    Returns:
        Tuple ``(name, brand)``.  ``name`` is a one-element list holding the
        joined ``h1.fn`` text nodes, cleaned with ``basic.clean_string`` and
        with non-breaking spaces removed; ``brand`` is the list of extracted
        ``span.brand`` text nodes.
    """
    title_parts = hxs.select('//h1[@class="fn"]/text()').extract()
    cleaned_title = basic.clean_string("".join(title_parts))
    brand = hxs.select('//span[@class="brand"]/text()').extract()
    # Drop non-breaking spaces left in the title.
    return [cleaned_title.replace(u"\xa0", "")], brand
def get_variants(self, hxs, response):
    """Build the colour/size variant descriptors of a product page.

    The page markup is cut up with plain string splits around the
    ``<div class="color">`` and ``<div class="size">`` elements.  Each
    variant is collected in a dict with the keys ``color``,
    ``color_value``, ``size`` and ``size_value``, dumped to JSON and passed
    through ``basic.cdata``.

    Args:
        hxs: page selector exposing ``select(xpath).extract()``.
        response: the page response; forwarded to ``self.get_vars`` and
            ``self.get_data``.

    Returns:
        List of the ``basic.cdata(...)`` results, one per appended variant.

    Raises:
        IndexError: when the page markup lacks the colour or size div (the
            ``[1]`` lookups after ``split``), or an expected node is empty.
    """
    page = hxs.select('//html').extract()
    page = " ".join(page)
    dict_one = {}
    test_one = []
    # Keep only the inside of the colour div, then check whether it holds a
    # <select>: one piece after the last split means there is no drop-down.
    temp = page.split('<div class="color">')
    temp = temp[1].split('</div>')
    temp = temp[0].split('<select name')
    # Only view_page, even_page, pre_page and hidd_page are used below; the
    # first four values are unpacked but not read in this method.
    viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
        response, hxs)
    if (len(temp) == 1):
        # Single colour: text of the div plus the hidden input's value.
        color = hxs.select('//div[@class="color"]/text()').extract()
        value = hxs.select(
            '//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value'
        ).extract()
        color[0] = color[0].replace(" ", "")
        color = basic.clean_string(color[0])
        # NOTE(review): from here on ``value`` is a single extracted item,
        # not a list -- see the loop over ``value`` further down.
        value = value[0]
        # color = basic.clean_string(color[0])
        # color = color.replace(" ","")
        #
        # dict['color'] = color
        # dict['color_value'] = value[0]
    else:
        # Colour drop-down: read option labels and values from the markup
        # after the "farge</option>" placeholder option (presumably the
        # Norwegian "colour" prompt -- confirm).
        test_color = basic.get_middle_text(temp[1], 'farge</option>', '</select>')
        color = basic.get_middle_text(test_color[0], '">', '</option>')
        value = basic.get_middle_text(test_color[0], 'value="', '">')
        for i in range(0, len(color)):
            color[i] = color[i].replace(" ", "")
        #
        # dict['color'] = color
        # dict['color_value'] = value
    # Same splitting trick for the size div.
    size_temp = page.split('<div class="size">')
    size_temp = size_temp[1].split('</div>')
    size_temp = size_temp[0].split('<select name')
    if (len(size_temp) == 1):
        # No size drop-down in the initial page.
        size = hxs.select('//div[@class="size"]/text()').extract()
        size = basic.clean_string(size[0])
        size = [size.replace(" ", "")]
        size_val = hxs.select(
            '//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value'
        ).extract()
        if size[0] == "":
            # Size text is empty: fetch the sizes per colour value through
            # self.get_data and emit one descriptor per colour.
            # NOTE(review): in the single-colour branch above ``value`` and
            # ``color`` were rebound to single items, so ``value[i]`` /
            # ``color[i]`` index into those items here -- confirm intended.
            for i in range(len(value)):
                resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])
                a_page = resp_page.split('<div class="siz')
                a_page = a_page[1].split('</select>')
                if len(a_page) == 1:
                    # No </select> in the returned size block: single size.
                    size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                    size_val = basic.get_middle_text(
                        a_page[0], 'value="', '"')
                    size_val = size_val[0]
                    size_val = [size_val]
                else:
                    # Size drop-down: options after the "...se</option>"
                    # placeholder option.
                    a_page = basic.get_middle_text(a_page[0], 'se</option>', '</select>')
                    size = basic.get_middle_text(a_page[0], '">', '</option>')
                    size_val = basic.get_middle_text(
                        a_page[0], 'value="', '">')
                dict_one["color"] = color[i]
                dict_one["color_value"] = value[i]
                dict_one["size_value"] = size_val
                for x in range(0, len(size)):
                    size[x] = basic.clean_string(size[x])
                    size[x] = size[x].replace(" ", "")
                dict_one["size"] = size
                # dict_one is reused across iterations; it is serialised
                # right away, so earlier entries are not affected.
                test_one.append(basic.cdata(json.dumps(dict_one)))
        else:
            # Size text present: a single descriptor for the whole page.
            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one['size'] = size
            dict_one['size_value'] = size_val
            # NOTE(review): this branch serialises with simplejson while the
            # other branches use json.
            test_one.append(basic.cdata(simplejson.dumps(dict_one)))
    else:
        # Size drop-down present in the initial page.
        test_size = basic.get_middle_text(size_temp[1], 'se</option>', '</select>')
        size = basic.get_middle_text(test_size[0], '">', '</option>')
        size_val = basic.get_middle_text(test_size[0], 'value="', '">')
        for x in range(0, len(size)):
            size[x] = basic.clean_string(size[x])
            size[x] = size[x].replace(" ", "")
        dict_one["color"] = color
        dict_one["color_value"] = value
        dict_one['size'] = size
        dict_one['size_value'] = size_val
        test_one.append(basic.cdata(json.dumps(dict_one)))
    return test_one