def create_subproducts(self, page):
    """Gets color information from the javascript on the page.
    Returns a list of dicts with color information; these are really the
    color variants of the product."""
    try:
        tmp = page.split("var largeImages = new Array();")[1]
    except IndexError:
        print "This product has no images"
    else:
        tmp = tmp.split("colorDropdownArray")[0]
        images = basic.get_middle_text(tmp, "ProductGroupProduct(", ");")
        image_names = self.get_image_names(page)
        color_products = []
        for im in images:
            product = {}
            attributes = im.split("',")
            product['normal_image_url'] = "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production"
            product['normal_image_url'] += self.custom_clean_string(attributes[26], True)
            product['description'] = basic.cdata(self.custom_clean_string(attributes[27]))
            product['color_id'] = self.custom_clean_string(attributes[7], True)
            product['swatch_color'] = basic.cdata(self.custom_clean_string(attributes[9]).replace(" ", ""))
            product['name'] = basic.cdata(image_names[product['color_id']])
            product['add_to_cart_id'] = self.custom_clean_string(attributes[0], True).replace(" ", "")
            product['price'] = self.custom_clean_string(attributes[10], True)
            color_products.append(product)
        return color_products
    return []

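# The `basic` helper module used throughout these spiders is not included in
# this section. A minimal sketch of how basic.get_middle_text and basic.cdata
# are assumed to behave, inferred from their call sites (names and behaviour
# are assumptions, not the actual module):
def get_middle_text(text, start, end):
    """Return every substring of `text` that sits between `start` and `end`."""
    pieces = []
    for chunk in text.split(start)[1:]:
        if end in chunk:
            pieces.append(chunk.split(end)[0])
    return pieces


def cdata(value):
    """Wrap a value in a CDATA section so it survives the XML export."""
    return "<![CDATA[%s]]>" % value
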
def parse(self, response):
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BootsItem()
    item['product_id'], item['store_id'], item['lang_id'], item['catalog_id'] = self.get_ids(hxs)
    item['name'] = self.get_name(hxs)
    (item['short_description'], sponsored, description, in_stock, item['ingredients'],
     patient_information_url, item['offer'], item['promotion']) = self.get_description(hxs)
    item['rating'] = self.get_rating(hxs)
    size, price_per_size = self.get_size(hxs)
    item['normal_image_url'], image_urls = self.get_images(hxs)
    brand, brand_image_url = self.get_brand(hxs)
    item['save_money'], item['old_price'] = self.get_oldies(hxs)
    # split the description chunks into description_1 ... description_n fields
    for i in range(0, len(description)):
        tag = 'description_%d' % (i + 1)
        item[tag] = [basic.cdata(description[i])]
    if sponsored is not None:
        item['sponsored'] = sponsored
    item['in_stock'] = ["NOT_IN_STOCK"]
    if in_stock == "In stock":
        item['in_stock'] = ["IN_STOCK"]
    item['order_id'] = hxs.select('//input[@name="orderId"]/@value').extract()
    item['cat_entry_id'] = hxs.select('//input[@name="catEntryId"]/@value').extract()
    item['calculation_usage_id'] = hxs.select('//input[@name="calculationUsageId"]/@value').extract()
    if brand_image_url is not None:
        item['brand'] = brand
        item['brand_image_url'] = ["43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(brand_image_url)]
        image_urls.append(brand_image_url)
    if patient_information_url is not None:
        item['patient_information_url'] = [basic.cdata(patient_information_url)]
    prices, point_prices, collect_points, colors, color_image_urls, variant_ids = self.get_color_variants(hxs)
    if size is not None:
        item['size'] = size
        item['price_per_size'] = price_per_size
    elif variant_ids is None:
        prices, point_prices, collect_points, sizes, variant_ids = self.get_size_variants(hxs)
    if color_image_urls is not None:
        image_urls.extend(color_image_urls)
    if variant_ids is not None:
        self.xml.create_xml(item)
        if colors is not None:
            self.create_color_variants(prices, point_prices, colors, color_image_urls,
                                       variant_ids, collect_points, item['product_id'])
        else:
            self.create_size_variants(prices, point_prices, sizes, variant_ids,
                                      collect_points, item['product_id'])
    else:
        prices = hxs.select('//p[@class="price"]/text()').extract()[0]
        point_prices = hxs.select('//span[@class="pointsPrice"]/text()').extract()[0]
        collect_points = [basic.get_price(hxs.select('//p[@class="collectPoints"]/text()').extract()[0])]
        item['price'] = [basic.get_price(prices)]
        item['points_price'] = [basic.get_price(point_prices)]
        item['collect_points'] = collect_points
        self.xml.create_xml(item)
    item['image_urls'] = image_urls
    return item

def _create_shop_looks(self, ids, names, urls):
    item = ExpressItem()
    for i in range(0, len(ids)):
        item['product_id'] = [ids[i]]
        item['name'] = [basic.cdata(names[i])]
        item['normal_image_url'] = [basic.cdata(urls[i])]
        item['shop_look'] = ['True']
        item['normal'] = ['False']
        item['shop_line'] = ['False']
        item['in_stock'] = ['IN_STOCK']
        self.xml.create_xml(item)

def get_description(self, hxs):
    short_description = hxs.select('//div[@class="productIntroCopy"]').extract()[0]
    try:
        suitable_for = ''.join(hxs.select('//div[@id="suitableFor"]//h4 | //div[@id="suitableFor"]//p | //div[@id="suitableFor"]//div').extract())
        short_description += suitable_for
    except:
        print "There's no suitable_for section"
    try:
        ingredients = basic.clean_string(' '.join(hxs.select('//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]/p | //div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]//div').extract()))
        if ingredients != '':
            ingredients = basic.cdata(ingredients)
    except:
        print "No ingredients found!"
        ingredients = None
    try:
        patient_information_url = hxs.select('//div[@class="downloadMedia"]//a/@href').extract()[0]
    except:
        print "No patient information found!"
        patient_information_url = None
    try:
        offer = hxs.select('//div[@id="mainOffer"]//a/text()').extract()[0]
    except:
        print "No special offer found!"
        offer = None
    try:
        promotion = hxs.select('//div[@id="otherOffers"]//a/text()').extract()
    except:
        print "No promotion found!"
        promotion = None
    try:
        sponsored = hxs.select('//div[@class="sponsored"]//p/text()').extract()[0]
    except:
        print "No sponsor message found!"
        sponsored = None
    description = ''.join(hxs.select('//div[@id="detailedInfo"]//div[@class="pd_panelInner"]//div[@class="pd_HTML"]').extract())
    description = basic.clean_string(description)
    # split the description into 2000-character chunks for the export
    description_overflow = len(description) / 2000
    desc = []
    if description_overflow > 0:
        for i in range(0, description_overflow + 1):
            if i < description_overflow:
                # slice end is exclusive, so no characters are lost between chunks
                desc.append(description[2000 * i:2000 * (i + 1)])
            else:
                desc.append(description[2000 * i:])
    else:
        desc = [description]
    try:
        in_stock = hxs.select('//div[@class="icon_pl_stock"]/text()').extract()[0]
    except:
        in_stock = ""
    return [basic.cdata(basic.clean_string(short_description))], [sponsored], desc, in_stock, \
        [ingredients], patient_information_url, [offer], promotion

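# get_description above splits the description into 2000-character pieces,
# apparently because downstream fields are capped at 2000 characters (the same
# limit shows up in other get_description variants below). A small generic
# helper for the same chunking, shown only as an illustrative sketch:
def split_into_chunks(text, size=2000):
    """Split `text` into consecutive pieces of at most `size` characters."""
    if not text:
        return [text]
    return [text[i:i + size] for i in range(0, len(text), size)]
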
def get_emb(self, hxs):
    emb = hxs.select('//div[@id="emb"]').extract()
    lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
    urls = []
    colors = []
    # options start at index 1, skipping the first dropdown entry
    for i in range(1, len(lettering_colors)):
        d = {}
        d['type'] = "lettering colors"
        d['name'] = lettering_colors[i]
        url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
        url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
        d['url'] = self.get_server_path_single(url)
        urls.append(url)
        colors.append(basic.cdata(simplejson.dumps(d)))
    lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
    letterings = []
    for i in range(1, len(lettering)):
        l = {}
        l['type'] = "lettering"
        l['name'] = lettering[i]
        url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
        url += lettering[i].lower().replace(' ', '_') + ".gif"
        l['url'] = self.get_server_path_single(url)
        letterings.append(basic.cdata(simplejson.dumps(l)))
        urls.append(url)
    logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
    log = []
    for i in range(1, len(logo)):
        logos = {}
        logos['type'] = "logo"
        logos['name'] = logo[i]
        url = "http://www.lydiasuniforms.com/images/logos/"
        url += logo[i].lower() + ".jpg"
        logos['url'] = self.get_server_path_single(url)
        urls.append(url)
        log.append(basic.cdata(simplejson.dumps(logos)))
    item = LydiasItem()
    item['color'] = colors
    item['lettering'] = letterings
    item['log'] = log
    xml.create_xml(item)
    xml.write_xml("emb")
    return urls

def get_serials(self, hxs):
    serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
    new = []
    for serial in serials:
        d = simplejson.loads(serial)
        new.append(basic.cdata(simplejson.dumps(d)))
    return new

def get_colors(self, hxs):
    colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
    new = []
    for color in colors:
        d = simplejson.loads(color)
        new.append(basic.cdata(simplejson.dumps(d)))
    return new

def get_description(self, hxs):
    description = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()[3]
    features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
    if features:
        features = [features[0][:2000]]
    return [basic.cdata(description)], basic.cdata_field(features)

def create_sizes_subproducts(self, main_id, id, color_code, hxs):
    print color_code
    jsons = []
    # branch for products where a color is provided
    if color_code != "":
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
              "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
        page = urllib2.urlopen(url).read()
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            item = {}
            # item['master_product_id'] = [id]
            item['size_short'] = tmp[0]
            item['price_url'] = self.get_size_price(str(main_id), str(color_code), tmp[0])
            item['size'] = tmp[1]
            # item['product_id'] = [id + "_" + str(i)]
            # item['in_stock'] = ["IN_STOCK"]
            # xml.create_xml(item)
            jsons.append(basic.cdata(simplejson.dumps(item)))
        return jsons
    # when no color is provided the page is built differently, so a separate branch is needed
    else:
        temp = hxs.select('//div[@class="not_size"]/text()').extract()
        for i in range(0, len(temp)):
            item = {}
            # item['master_product_id'] = [id]
            # item['product_id'] = [id + "_" + str(i)]
            item['size_short'] = temp[i]
            item['price_url'] = self.get_size_price(str(main_id), "", temp[i])
            # item['in_stock'] = ["IN_STOCK"]
            # xml.create_xml(item)
            jsons.append(basic.cdata(simplejson.dumps(item)))
        return jsons

def get_basic_info(self, hxs):
    """Gets basic info about the product (name, shown with)."""
    name = hxs.select('//div[@id="product_name"]/text()').extract()
    if name:
        name = basic.cdata_field(name)
    shown_with = hxs.select('//div[@id="shown_with_container"]').extract()
    if shown_with:
        shown_with = [basic.cdata(shown_with[0])]
    return name, shown_with

def make_colors_json(self, color_urls, color_names, color_codes):
    jsons = []
    for i in range(0, len(color_urls)):
        d = {}
        d['color_url'] = self.get_server_path_single(color_urls[i])
        d['color_name'] = color_names[i]
        d['color_short'] = color_codes[i]
        jsons.append(basic.cdata(simplejson.dumps(d)))
    return jsons

def gold_coverage(self, hxs):
    ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
    labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
    new = []
    for i in range(0, len(ids)):
        d = {}
        d['id'] = ids[i]
        d['name'] = labels[i]
        new.append(basic.cdata(simplejson.dumps(d)))
    return new

def parse_can(self, response):
    """Parse function for scraping the Canadian sites.
    Meta information about the language is sent with the request."""
    self.counter += 1
    basic.print_status(self.counter, self.total)
    item = PartyliteItem()
    hxs = HtmlXPathSelector(response)
    image_urls = []
    if 'redirect_urls' in response.request.meta:
        item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0]]
        self.exc.code_handler(102, response.request.meta['redirect_urls'])
        if 'language' in response.request.meta:
            item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0] + "_can" + "_" + response.meta['language']]
        try:
            index = self.products['product_ids'].index(self.get_id(response.request.meta['redirect_urls'][0])[0])
            item['name'] = [basic.cdata(item['product_id'][0] + self.products['names'][index])]
            self.products['status'][index] = 'no_avail'
        except (KeyError, ValueError):
            # list.index raises ValueError when the id is not in the list
            print "This %s id is not in list" % (item['product_id'][0])
        item['in_stock'] = ['NOT_AVAILABLE']
        item['product_id'] = self.remove_spaces(item['product_id'])
        self.xml.create_xml(item)
    else:
        index = self.products['product_ids'].index(self.get_id(response.url)[0])
        try:
            item['product_id'] = self.get_id(response.url)
            item['name'], item['shown_with'] = self.get_basic_info(hxs)
            item['description'] = self.get_description(hxs)
            if 'language' in response.meta:
                item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
            response.meta['item'] = item
            page = " ".join(hxs.select('//html').extract())
            image_urls = self.get_more_images(page)
            item['normal_image_url'] = self.get_server_path_field(image_urls)
            item['in_stock'] = self.get_in_stock(hxs)
            color_products = self.create_subproducts(page)
            if color_products:
                self.write_subproducts(item['product_id'], color_products, xml)
            else:
                item['add_to_cart_id'] = self.get_add_to_cart_id(page)
            item['custom_price'], item['custom_discount'] = self.get_price(hxs)
            self.products['status'][index] = "ran"
        except StandardError:
            basic.print_error()
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            item['product_id'] = self.remove_spaces(item['product_id'])
            self.xml.create_xml(item)
    if image_urls:
        item['image_urls'] = image_urls
    return item

def parse(self, response):
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = SportmanItem()
    if "redirect_urls" in response.request.meta:
        cur_url = response.request.meta["redirect_urls"][0]
    else:
        cur_url = response.url
    index = self.products["urls"].index(cur_url)
    try:
        if "redirect_urls" in response.request.meta:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"] = [self.products["names"][index]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            item["name"], item["short_desc"], item["description"], item["old_price"], \
                item["custom_price"], item["product_id"], item["sku"] = self.get_basic_info(hxs)
            item["in_stock"] = ["IN_STOCK"]
            viewstate, eventval, prevpage, hidden, view_page, even_page, pre_page, hidd_page = self.get_vars(response, hxs)
            viewstate1 = viewstate[:2000]
            viewstate2 = viewstate[2000:4000]
            viewstate3 = viewstate[4000:6000]
            viewstate4 = viewstate[6000:8000]
            viewstate5 = viewstate[8000:10000]
            viewstate6 = viewstate[10000:]
            item["viewstate1"] = [basic.cdata(viewstate1)]
            item["viewstate2"] = [basic.cdata(viewstate2)]
            item["viewstate3"] = [basic.cdata(viewstate3)]
            item["viewstate4"] = [basic.cdata(viewstate4)]
            item["viewstate5"] = [basic.cdata(viewstate5)]
            item["viewstate6"] = [basic.cdata(viewstate6)]
            item["eventval"] = [basic.cdata(eventval)]
            item["size_options"] = self.get_variants(hxs, response)
            images_url = self.get_images(hxs)
            item["normal_image_url"] = self.get_server_path(images_url)
            self.xml.create_xml(item)
            item.clear()
            item["image_urls"] = self.get_images(hxs)
            self.products["status"][index] = "ran"
    except:
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item

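# The six manual viewstate slices above can be produced with a loop; a minimal
# sketch assuming the same 2000-character chunks and the viewstate1..viewstate6
# field names used by this spider (illustrative only, not the spider's own code):
def split_viewstate(viewstate, chunk_size=2000, chunks=6):
    """Return `chunks` slices of `viewstate`; the last slice takes the remainder."""
    parts = []
    for i in range(chunks):
        start = i * chunk_size
        end = None if i == chunks - 1 else (i + 1) * chunk_size
        parts.append(viewstate[start:end])
    return parts

# Usage sketch inside parse():
#     for n, part in enumerate(split_viewstate(viewstate), 1):
#         item["viewstate%d" % n] = [basic.cdata(part)]
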
def get_basic_info(self, hxs):
    name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
    price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
    description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
    description = basic.clean_string(description)
    old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
    if not price:
        price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
    if old_price:
        old_price = [re.sub('[^0-9.]', '', old_price[0])]
    price = [re.sub('[^0-9.]', '', price[0])]
    return name, price, old_price, [description]

def make_json(self, ids, names, prices, images, urls):
    jsons = []
    for i in range(0, len(ids)):
        json = "{" + ' "id" : "' + str(ids[i][0]) + '", '
        json += '"name" : "' + str(names[i][0]) + '", '
        # insert function for storing the right image path
        json += '"image_url" : "' + str(images[i]) + '", '
        json += '"product_url" : "' + urls[i] + '", '
        json += '"price" : "' + str(prices[i][0]) + '" } '
        json = basic.cdata(json)
        jsons.append(json)
    return jsons

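# Building JSON by string concatenation, as in make_json above, breaks as soon
# as a name or URL contains a double quote. A sketch of the same payload built
# with simplejson.dumps, which handles the escaping (same argument shapes
# assumed as in make_json; an alternative sketch, not the spider's own code):
def make_json_safe(self, ids, names, prices, images, urls):
    jsons = []
    for i in range(0, len(ids)):
        d = {
            "id": str(ids[i][0]),
            "name": str(names[i][0]),
            "image_url": str(images[i]),
            "product_url": urls[i],
            "price": str(prices[i][0]),
        }
        jsons.append(basic.cdata(simplejson.dumps(d)))
    return jsons
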
def get_basic_info(self, hxs):
    """Gets basic info about the product. Returns description and promo text."""
    description = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
    description = basic.clean_string(description)
    description = [basic.cdata(description)]
    promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
    if not promo_text:
        promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/font').extract()
    if promo_text:
        promo_text = basic.cdata_field(promo_text)
    return description, promo_text

def make_reviews_json(self, title, text, author, location):
    jsons = []
    for i in range(0, len(title)):
        json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" : "%s" }' % (
            title[i], text[i], author[i], location[i])
        json = basic.cdata(json)
        jsons.append(json)
    return jsons

def get_images(self, hxs):
    images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
    tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
    images_list = []
    img = []
    for i in range(0, len(images)):
        d = {}
        d["image_url"] = self.get_server_path(images[i])
        img.append(images[i])
        if "site1sku" in tags[i]:
            d["product_serial"] = tags[i].replace("site1sku", "")
        else:
            d["product_serial"] = tags[i]
        images_list.append(basic.cdata(simplejson.dumps(d)))
    return images_list, img

def get_colors(self, page, color_names):
    """Gets color information with images from the javascript on the page.
    Returns json with the color name and image url for each color, plus a list
    of image urls that can be used for download later."""
    script = basic.get_middle_text(page, 'var imageMap_0 = new Array();', '</script>')[0]
    colors = basic.get_middle_text(script, '] = ', ';')
    image_urls = []
    colors_json = []
    for i in range(0, len(color_names)):
        color = burton.replace_color_json(colors[i])
        color = simplejson.loads(color)
        color['cname'] = color_names[i]
        color.pop('reg')
        image_urls.append(color['enh'])
        color['enh'] = self.get_server_path(color['enh'])
        colors_json.append(basic.cdata(simplejson.dumps(color)))
    return colors_json, image_urls

def get_price(self, hxs):
    """Gets product prices: the regular price and the discount price if there is one."""
    price = hxs.select('//span[@id="divUnitPrice"]/text()').extract()
    if not price:
        price = hxs.select('//div[@id="product_price"]/span[1]/text()').extract()
    if not price:
        price = hxs.select('//div[@id="product_price"]/text()').extract()
    discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
    price = basic.clean_string(price[0])
    price = re.sub(" +", " ", price)
    price = price.replace("Price:", "")
    price = price.replace("Prix:", "")
    price = basic.cdata(price.strip())
    if discount:
        discount = basic.cdata_field(discount)
    return [price], discount

def get_recommended(self, hxs):
    """Gets recommended product information.
    Returns information about recommended products as dicts."""
    rec = hxs.select('//div[@id="right_column_container"]/div')
    new = []
    i = 0
    for r in rec:
        d = {}
        # to do: see how to get the full href (different accounts)
        if not i:
            d['link'] = r.select('div/a/@href').extract()[0]
            d['image'] = "http://www.partylite.biz/imaging/resize"
            d['image'] += r.select('div/a/img/@src').extract()[0]
            d['name'] = r.select('div/a/text()').extract()[0]
            new.append(basic.cdata(simplejson.dumps(d)))
        i += 1
    return new

def get_variants(self, page):
    """Gets jsons for colors with all available sizes.
    The json also carries all the size information found on the site."""
    script = basic.get_middle_text(page, 'var skuSizeColorObj = new Array();', '</script>')[0]
    sizes = []
    image_urls = []
    color_names = []
    colors = script.split('skuSizeColorObj')
    for c in range(1, len(colors)):
        temp = basic.get_middle_text(colors[c], '= ', ';')
        t = simplejson.loads(burton.replace_for_json(temp[0]))
        # keep the original swatch url for download, then rewrite it to the server path
        image_urls.append(t['swatchURL'])
        color_names.append(t['ColorDesc'])
        t['swatchURL'] = self.get_server_path(t['swatchURL'])
        sizes.append(basic.cdata(simplejson.dumps(t)))
    return sizes, image_urls, color_names

def get_basic_info(self, hxs):
    name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
    short_desc = hxs.select('//div[@class="description2"]/text()').extract()
    description = hxs.select('//div[@id="fragment-1"]/div[@class="description"]').extract()
    description = sportman.delete_tags(re, description[0])
    description = [basic.cdata(description)]
    old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
    if old_price != []:
        old_price = " ".join(old_price)
        old_price = old_price.split(':')
        old_price = old_price[1].replace('Kr', '')
        old_price = [old_price.replace(" ", "")]
    price = hxs.select('//span[@class="nowprice"]/text()').extract()
    if price != []:
        price = " ".join(price)
        price = price.split(':')
        price = price[1].replace('Kr', '')
        price = [price.replace(" ", "")]
    else:
        price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = " ".join(price)
        price = price.split(':')
        price = price[1].replace('Kr', '')
        price = [price.replace(" ", "")]
    id = hxs.select('//div[@class="articlenumber"]').extract()
    id = " ".join(id)
    id = id.replace(u"\xa0", "")
    id = basic.get_middle_text(id, 'Art.nr.', '</div>')
    sku = id
    id = [id[0]]
    return name, short_desc, description, old_price, price, id, sku

def get_sizes(self, id, hxs):
    showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
    itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
    salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
    url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
    url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
    jsons = []
    print "reading page..."
    page = urllib2.urlopen(url).read()
    print "page read"
    page = page.replace("'", "")
    page = page.replace("[", ",")
    page = page.replace(",,", "")
    temp = page.split("]")
    for i in range(0, len(temp) - 2):
        tmp = temp[i].split(",")
        json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" : "%s", "some_id" : "%s" }' % (
            tmp[0], tmp[1], tmp[2], tmp[3])
        json = basic.cdata(json)
        jsons.append(json)
    return jsons

def parse(self, response):
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BurtonItem()
    page = hxs.extract()
    if 'redirect_urls' in response.request.meta:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)
    try:
        if 'redirect_urls' in response.request.meta:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            item['product_id'], item['name'] = self.get_basic_info(hxs)
            item['description'], item['features'] = self.get_description(hxs)
            item['variants'], thumb_urls, color_names = self.get_variants(page)
            item['all_sizes'] = self.get_all_sizes(page)
            item['color_json'], image_urls = self.get_colors(page, color_names)
            item['price'], item['old_price'] = self.get_prices(hxs)
            item['in_stock'] = ['IN_STOCK']
            item['product_link'] = [basic.cdata(response.url)]
            self.xml.create_xml(item)
            item['image_urls'] = image_urls + thumb_urls
            self.products["status"][index] = "ran"
    except:
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item

def parse_whole_xml(self):
    xml_dir = "xml/{0}".format(self.name)
    file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
    downloader = Downloader()
    if self.d['download']:
        downloader.get_file(xml_dir, file_url, "client_feed")
    else:
        if not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)):
            basic.warning("Feed file doesn't exist; please de-select the no-download option")
            os._exit(2)
    self.number = 0
    xml_item = ChomeItem()
    urls_all = []
    for event, elem in iterparse('xml/{0}/client_feed.xml'.format(self.name)):
        if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties":
            for r in elem:
                p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
                if r.tag == p + "Id" and r.text in self.no_urls['product_ids']:
                    index = self.no_urls['product_ids'].index(r.text)
                    self.no_urls['status'][index] = 'ran'
                    self.number += 1
                    urls = []
                    flag = 0
                    for x in elem:
                        if x.tag == p + "Id":
                            xml_item['product_id'] = [x.text]
                        elif x.tag == p + "EngLongDesc" and x.text is not None:
                            xml_item['description_english'] = [self.escape(basic.cdata(x.text))]
                        elif x.tag == p + "RetailPrice":
                            xml_item['custom_price'] = [x.text[:-2]]
                        elif x.tag == p + "SpnLongDesc" and x.text is not None:
                            xml_item['description_spanish'] = [self.escape(basic.cdata(x.text))]
                        elif x.tag == p + "PartNumber":
                            xml_item['add_to_cart_id'] = [x.text]
                        elif x.tag == p + "MaxQty":
                            xml_item['max_qty'] = [x.text]
                        elif x.tag == p + "TimeType":
                            xml_item['time_type'] = [x.text]
                        elif x.tag == p + "SpnName" and x.text is not None:
                            xml_item['name_spanish'] = [x.text]
                        elif x.tag == p + "EngName":
                            xml_item['name_english'] = [x.text]
                        elif x.tag == p + "ImagePath_Large" and x.text is not None:
                            urls.append(self.get_absolute(x.text))
                            xml_item['normal_image_url'] = [self.get_server_path(self.get_absolute(x.text))]
                        elif x.tag == p + "IsActive":
                            # element text is a string, so compare against "0"
                            if x.text == "0":
                                xml_item['in_stock'] = ["NOT_IN_STOCK"]
                            else:
                                xml_item['in_stock'] = ['IN_STOCK']
                        else:
                            for i in range(1, 4):
                                tag = p + "Alternate%sImagePath_Large" % (str(i))
                                if x.tag == tag and x.text is not None:
                                    urls.append(self.get_absolute(x.text))
                                    xml_item['normal_image_url'].append(self.get_server_path(self.get_absolute(x.text)))
                    # change image paths for normal_image_url and return urls
                    self.xml.create_xml(xml_item)
                    urls_all += urls
    for i in range(0, len(self.no_urls['status'])):
        if self.no_urls['status'][i] != 'ran':
            self.no_urls['status'][i] = 'not_found'
    return urls_all

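# iterparse keeps parsed elements in memory unless they are cleared, which can
# matter for a large client feed. A minimal sketch of the same loop shape with
# elem.clear() added (illustrative only; parse_whole_xml above would need its
# per-element handling folded back in):
from xml.etree.ElementTree import iterparse

def iter_properties(feed_path):
    properties_tag = "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties"
    for event, elem in iterparse(feed_path):
        if elem.tag == properties_tag:
            yield elem
            # free the element and its children once they have been handled
            elem.clear()
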
def get_basic_info(self, hxs):
    name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract()
    description = basic.cdata(hxs.select('//div[@id="productDescription"]').extract()[0])
    return name, [description]

def get_all_sizes(self, page):
    script = basic.get_middle_text(page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0]
    all_sizes = basic.get_middle_text(script, ']="', '";')
    return [basic.cdata(simplejson.dumps(all_sizes))]

def get_variants(self, hxs, response):
    page = hxs.select("//html").extract()
    page = " ".join(page)
    dict_one = {}
    test_one = []
    temp = page.split('<div class="color">')
    temp = temp[1].split("</div>")
    temp = temp[0].split("<select name")
    viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = \
        self.get_vars(response, hxs)
    # a single color is rendered as plain text; multiple colors come as a <select> dropdown
    if len(temp) == 1:
        color = hxs.select('//div[@class="color"]/text()').extract()
        value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
        color[0] = color[0].replace(" ", "")
        color = basic.clean_string(color[0])
        value = value[0]
    else:
        test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
        color = basic.get_middle_text(test_color[0], '">', "</option>")
        value = basic.get_middle_text(test_color[0], 'value="', '">')
        for i in range(0, len(color)):
            color[i] = color[i].replace(" ", "")
    size_temp = page.split('<div class="size">')
    size_temp = size_temp[1].split("</div>")
    size_temp = size_temp[0].split("<select name")
    if len(size_temp) == 1:
        size = hxs.select('//div[@class="size"]/text()').extract()
        size = basic.clean_string(size[0])
        size = [size.replace(" ", "")]
        size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()
        if size[0] == "":
            # no sizes on the page: post back for each color value to get its sizes
            for i in range(len(value)):
                resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])
                a_page = resp_page.split('<div class="siz')
                a_page = a_page[1].split("</select>")
                if len(a_page) == 1:
                    size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                    size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                    size_val = size_val[0]
                    size_val = [size_val]
                else:
                    a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                    size = basic.get_middle_text(a_page[0], '">', "</option>")
                    size_val = basic.get_middle_text(a_page[0], 'value="', '">')
                dict_one["color"] = color[i]
                dict_one["color_value"] = value[i]
                dict_one["size_value"] = size_val
                for x in range(0, len(size)):
                    size[x] = basic.clean_string(size[x])
                    size[x] = size[x].replace(" ", "")
                dict_one["size"] = size
                test_one.append(basic.cdata(json.dumps(dict_one)))
        else:
            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val
            test_one.append(basic.cdata(simplejson.dumps(dict_one)))
    else:
        test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
        size = basic.get_middle_text(test_size[0], '">', "</option>")
        size_val = basic.get_middle_text(test_size[0], 'value="', '">')
        for x in range(0, len(size)):
            size[x] = basic.clean_string(size[x])
            size[x] = size[x].replace(" ", "")
        dict_one["color"] = color
        dict_one["color_value"] = value
        dict_one["size"] = size
        dict_one["size_value"] = size_val
        test_one.append(basic.cdata(json.dumps(dict_one)))
    return test_one

def get_description(self, hxs):
    description = hxs.select('//div[@id="item_description"]').extract()
    description = [basic.cdata(basic.remove_tags(description[0]))]
    # replace the unicode fraction slash with a plain "/"
    description = [description[0].replace(u"\u2044", "/")]
    return description
