class ExpressSpider(CrawlSpider):
    """Scrapy spider for scraping Express product pages.

    Reads product URLs/ids from an Excel sheet, parses each product page into
    an ``ExpressItem`` (master product, color children, size grandchildren,
    optional "shop look"/"shop line" variants), accumulates everything in a
    ``VariantsXml`` document, and on spider close writes the XML, optionally
    exports to a database and mails a report.

    NOTE(review): written for Python 2 / old Scrapy (``print`` statements,
    ``StandardError``, ``HtmlXPathSelector``, ``dispatcher.connect``).
    """

    name = "express"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]  # placeholder; replaced in __init__
    temp_msg = ""                 # accumulates sheet-vs-site id mismatch notes for the report mail
    handle_httpstatus_list = [404]  # let 404 responses reach parse() instead of being dropped
    counter = 0                   # number of responses processed so far

    def __init__(self, *a, **kw):
        """Wire the close signal, read CLI arguments, load the Excel sheet and
        prepare the XML builder. Raises through ``get_lists_from_excel`` error
        handling if the sheet cannot be read."""
        super(ExpressSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = CommonTerminal(sys.argv, self.name)
        self.log = Logger()
        self.d = terminal.get_arguments()  # parsed CLI options (at least 'file' and 'upload')
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        shops = CreateShops(self.d['file'], self.xml)
        try:
            shops.get()
        except IndexError:
            print "This sheet has no shop look or line"
        self.get_lists_from_excel()
        self.add_properties(self.xml)
        # NOTE(review): only the first two URLs are scraped ([:2]) — looks like a
        # leftover debugging limit; confirm before production use.
        self.start_urls = self.url_list[:2]
        self.total = len(self.start_urls)

    def parse(self, response):
        """Parse one product page into an ExpressItem and feed it to the XML
        builder, including child products and shop look/line variants.

        Returns the (last written) item so Scrapy's pipelines (image download
        via 'image_urls') still run."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = ExpressItem()
        index = self.url_list.index(response.url)
        # mark this URL slot as processed by overwriting it with the counter
        self.url_list[index] = self.counter
        flag = 0       # set to 1 once the page looks like a real product page
        shop_look = 0  # NOTE(review): assigned but never read in this method
        # main try that catches all unhandled errors
        try:
            hxs = HtmlXPathSelector(response)
            if response.url != "http://www.zmags.com/":
                error_404 = hxs.select('//img[@alt="404 Error Page Not Found"]').extract()
                flag = 1
                if not error_404:
                    # NOTE(review): flag is set both before and inside this branch;
                    # presumably only one assignment was intended — confirm.
                    flag = 1
                    available = hxs.select('//span[@class="glo-tex-error"]/text()').extract()
                    page = " ".join(hxs.select('//html').extract())
                    # part for creating main product in xml
                    id = self.get_product_id(hxs)[0]
                    if id != self.id_list[index]:
                        # sheet id and on-site id disagree; remember it for the report mail
                        msg = "\nNot equal, id in sheet {0}, on site {1}".format(self.id_list[index], id)
                        self.temp_msg += msg
                    item['product_id'] = [id]
                    item['name'] = self.get_name(hxs)
                    item['description'], item['promo_text'] = self.get_basic_info(hxs)
                    item['master_price'], item['discount_price'] = self.get_product_prices(hxs)
                    item['shop_look'] = ['False']
                    item['normal'] = ['True']
                    item['shop_line'] = ['False']
                    item['in_stock'] = ["NOT_IN_STOCK"]
                    # NOTE(review): available[0] raises IndexError when the error span is
                    # absent; that lands in the broad StandardError handler below — confirm
                    # whether that is the intended control flow.
                    if available[0] != "This item is no longer available for purchase.":
                        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
                        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
                        color_names, urls, swatch_image_names, jsons = self.get_swatch_images(hxs)
                        #urls = basic.cdata_field(self.map_url_to_server(urls, id, True))
                        item['color_image_url'] = self.create_color_json(urls, color_names)
                        item['in_stock'] = ["IN_STOCK"]
                        item['product_page'] = [response.url]
                        self.xml.create_xml(item)
                        product_images, images_grouped = self.parse_jsons(jsons, color_names)
                        ids, sizes, prices = self.get_variants(page)
                        # calling function that will handle creating all child products
                        self.create_child_products(id, ids, sizes, prices, images_grouped)
                        item['image_urls'] = urls + product_images
                        if self.shop_look_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_look_list[index], id, page,
                                                     images_grouped, response.url, index)
                        if self.shop_line_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_line_list[index], id, page,
                                                     images_grouped, response.url, index)
                    else:
                        # product no longer purchasable: write what we have, log code 102
                        self.xml.create_xml(item)
                        self.exc.code_handler(102, response.url)
                else:
                    # on-site 404 page
                    self.exc.code_handler(104, response.url)
            else:
                # redirected to zmags.com: URL was not really provided
                basic.not_provided()
                self.exc.code_handler(101, response.url)
            if not flag:
                # page never looked like a product page; emit a minimal NOT_AVAILABLE record
                item['product_id'] = [self.id_list[index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                item['name'] = ["not available"]
                self.xml.create_xml(item)
        except StandardError:
            self.exc.code_handler(100, response.url)
        #if it's last product write xml and run end_operations
        return item

    def parse_for_shop_look(self, hxs, id, product_id, page, images_grouped, product_url, index):
        """Special parse function for shop looks and lines. It gets same info stored
        in different format, mostly json and reference to master product id that is
        actually shop look/line id.
        TO DO: see if there is need to specially handle the case for not available

        :param id: shop look/line id (becomes the master product id)
        :param product_id: the real product's style id
        :param page: full page markup (string) used for variant extraction
        :param images_grouped: dict color name -> list of image names
        """
        item = ExpressItem()
        item['master_product_id'] = [id]
        item['product_id'] = [id + "_" + product_id]
        if self.ordered:
            item['order_index'] = [self.order_list[index]]
        item['style'] = [product_id]
        item['product_page'] = [product_url]
        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
        # below is part for creating swatch images and images json
        color_names, urls, swatch_image_names, jsons = self.get_swatch_images(hxs)
        i = 0
        colors = []
        for k in color_names:
            # one JSON object per color: swatch url plus absolute image urls
            d = {'name': k, 'swatch_url': urls[i],
                 'image_url': self.get_absolute_url(images_grouped[k])}
            i += 1
            colors.append(simplejson.dumps(d))
        item['colors'] = basic.cdata_field(colors)
        item['price'], item['discount_price'] = self.get_product_prices(hxs)
        item['description'], item['promo_text'] = self.get_basic_info(hxs)
        item['name'] = self.get_name(hxs)
        # below is part for creating variants json
        ids, sizes, prices = self.get_variants(page)
        variants = []
        for k in ids:
            d = {'color': k, 'prices': prices[k], 'ids': ids[k]}
            try:
                d['sizes'] = sizes[k]
            except StandardError:
                print "This product has no sizes"
            variants.append(simplejson.dumps(d))
        item['variants'] = basic.cdata_field(variants)
        self.xml.create_xml(item)

    def parse_shop_look(self, hxs):
        """Alternative shop-look parser that emits one ensemble master item plus one
        child item per product in the ensemble.

        NOTE(review): uses the bare name ``xml`` (not ``self.xml``) and hard-coded
        "DUMMIE1" ids — this looks like unfinished/experimental code; unless a
        module-level ``xml`` object exists, calling it raises NameError. Verify.
        """
        products = hxs.select('//div[@id="cat-ens-prod-item"]')
        i = 0
        # do this with actual id
        item = ExpressItem()
        whole_page = hxs.extract()
        whole_page = "".join(whole_page)
        # ensemble id lives in inline javascript, not in the DOM
        ensemble_id = basic.get_middle_text(whole_page, "ensembleId: '", "',")
        name = hxs.select('//div[@id="cat-ens-prod-con"]/h1/text()').extract()
        name = basic.clean_string_field(name)
        item['ensemble_id'] = ensemble_id
        item['normal_image_url'] = self.shl_get_image(hxs)
        item['product_id'] = ["DUMMIE1"]
        item['shop_look'] = ['True']
        item['normal'] = ['False']
        item['shop_line'] = ['False']
        item['in_stock'] = ['IN_STOCK']
        item['name'] = name
        xml.create_xml(item)
        item.clear()
        for p in products:
            i += 1
            item = ExpressItem()
            item['master_product_id'] = ['DUMMIE1']
            item['product_id'] = ["DUMMIE1_" + str(i)]
            item['name'], item['price'], item['style'] = self.shl_basic_info(p)
            page = p.extract()
            item['variants'] = basic.cdata_field([self.shl_create_variants(self.get_variants(page))])
            item['colors'] = basic.cdata_field(self.shl_get_swatches(p))
            xml.create_xml(item)
        # return images for download here once it's needed

    def get_categories(self, hxs):
        """Return (category_id, sub_category_id) lists read from hidden form inputs."""
        category_id = hxs.select('//input[@name="categoryId"]/@value').extract()
        sub_category_id = hxs.select('//input[@name="subCategoryId"]/@value').extract()
        return category_id, sub_category_id

    def get_add_to_cart_id(self, hxs):
        """Return the add-to-cart product id list from the hidden productId input."""
        return hxs.select('//input[@name="productId"]/@value').extract()

    def shl_get_image(self, hxs):
        """Return the shop-look main image URL (single-element list), built from the
        'imagesets' variable embedded in the page's javascript."""
        page = hxs.extract()
        image = basic.get_middle_text(page, 'imagesets = "', '";')
        image = "http://t.express.com/com/scene7/s7d5/=/is/image/expressfashion/%s/i81" % (image[0])
        return [image]

    def shl_create_variants(self, f):
        """Creates variants for shop look products. Stored in dict with all info and
        returned as json.

        :param f: (ids, sizes, prices) tuple as returned by ``get_variants``
        """
        d_main = {}
        n = []
        colors = [p for p in f[0]]
        for c in colors:
            d = {'color': c, 'ids': f[0][c]}
            try:
                d['sizes'] = f[1][c]
            except StandardError:
                print "This product has no sizes"
            d['prices'] = f[2][c]
            n.append(d)
        # NOTE(review): d_main is assigned but the list n is what gets serialized
        d_main['variants'] = n
        return simplejson.dumps(n)

    def shl_get_swatches(self, hxs):
        """Function for getting swatches for shop look way.
        Stores information in dict (name, swatch_url and image url).

        NOTE(review): the dict ``d`` is reused across iterations; since each dump
        happens after the keys are overwritten the output is correct, but a fresh
        dict per color would be clearer.
        """
        p = hxs.select('div[@class="cat-ens-prod-info"]/div[@class="cat-ens-prod-swatch-display"]')
        p = p.select('span/text()').extract()
        l = []
        d = {}
        for c in p:
            # each span holds "name,swatch_url,image_url" comma-separated
            temp = c.split(",")
            d['name'] = temp[0]
            d['swatch_url'] = temp[1]
            d['image_url'] = temp[2]
            l.append(simplejson.dumps(d))
        return l

    def shl_basic_info(self, hxs):
        """Return (name, price, style) lists for one shop-look product node."""
        name = hxs.select('div[@class="cat-ens-prod-info"]/h1/text()').extract()
        name = basic.clean_string_field(name)
        price = hxs.select('div[@class="cat-ens-prod-info"]/span/text()').extract()
        price = basic.clean_spaces_field(basic.clean_string_field(price))
        style = hxs.select('div[@class="cat-ens-prod-info"]/text()').extract()
        if len(style) > 2:
            style = [basic.clean_string(style[1])]
        else:
            style = []
        return name, price, style

    def create_color_json(self, urls, names):
        """Return a list of JSON strings pairing each swatch url with its color name.

        NOTE(review): the dict ``d`` is shared across iterations (same pattern as
        shl_get_swatches) — works because dumps happens each pass.
        """
        d = {}
        n = []
        for i in range(0, len(urls)):
            d['url'] = urls[i]
            d['name'] = names[i]
            n.append(simplejson.dumps(d))
        return n

    def get_basic_info(self, hxs):
        """Gets basic info about products. Returns description and promo text
        (description CDATA-wrapped; promo text may be an empty list)."""
        description = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
        description = basic.clean_string(description)
        description = [basic.cdata(description)]
        promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
        if not promo_text:
            # some pages wrap the promo text in a font tag instead
            promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/font').extract()
        if promo_text:
            promo_text = basic.cdata_field(promo_text)
        return description, promo_text

    def get_name(self, hxs):
        """Return the cleaned product name as a single-element list."""
        name = hxs.select('//div[@id="cat-pro-con-detail"]/h1/text()').extract()[0]
        name = [basic.clean_string(name)]
        return name

    def get_product_prices(self, hxs):
        """Gets product prices, regular and discount if it exists.
        If no discount returns empty field. Prices are stripped to digits, dot
        and comma."""
        price = hxs.select('//li[@class="cat-pro-price"]/strong/text()').extract()
        discount_price = []
        if not price:
            # discounted layout: old price + sale price in separate spans
            price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-oldP"]/text()').extract()
            discount_price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-saleP"]/text()').extract()
        if discount_price:
            discount_price = [re.sub('[^0-9.,]', '', discount_price[0])]
        price = [re.sub('[^0-9.,]', '', price[0])]
        return price, discount_price

    def get_product_id(self, hxs):
        """Gets product sku from the page as a field"""
        sku = hxs.select('//input[@name="omnitureStyleID"]/@value').extract()[0]
        sku = sku.replace(";", "")
        return [sku]

    def get_swatch_images(self, hxs):
        """Function for getting swatch images info (names, urls, image names and urls).
        Also it gets and json as list of json urls for getting images set for every
        color. Falls back to the javascript image set (color "no_color") when the
        page has no swatches."""
        urls = hxs.select('//li[@id="widget-product-swatches"]/a/img/@src').extract()
        color_names = hxs.select('//li[@id="widget-product-swatches"]/a/img/@alt').extract()
        swatch_image_names = self.get_swatch_image_name(urls)
        if not swatch_image_names and not color_names:
            color_names.append("no_color")
            swatch_image_names = self.get_imagesets(hxs)
        jsons = self.get_json(swatch_image_names)
        return color_names, urls, swatch_image_names, jsons

    def get_imagesets(self, hxs):
        """Function for getting image set in case where there is no color for product.
        Gets image set info from the javascript on the page and selects only first
        one, if there is more because there is only one color to associate with
        (no_color)"""
        page = hxs.extract()
        print len(page)  # NOTE(review): debug output, consider removing
        iset = basic.get_middle_text(page, 'imagesets = "', '"; //Change')
        iset = iset[0].split(',')
        return [iset[0]]

    def get_swatch_image_name(self, image_sites):
        """Gets swatch image name from swatch image url"""
        image_names = []
        for x in range(0, len(image_sites)):
            # name sits between ".../fashion/" and "_s" in the swatch url
            name = basic.get_middle_text(image_sites[x], "fashion/", "_s")[0]
            image_names.append(name)
        return image_names

    def get_json(self, image_names):
        """Gets list of jsons from list of swatch images names"""
        jsons = []
        for i in range(0, len(image_names)):
            json = "http://s7d5.scene7.com/is/image/expressfashion/" + image_names[i] + "?req=imageset,json"
            jsons.append(json)
        return jsons

    def parse_jsons(self, jsons, color_names):
        """Parsing json from json urls. Returning all images in field, also returns
        them grouped by colors, so those groups can be used later when creating
        child products in xml.

        NOTE(review): performs blocking urllib2 downloads inside the spider —
        bypasses Scrapy's scheduler; acceptable for this scraper's scale.
        """
        images = []
        images_grouped = {}
        for i in range(0, len(jsons)):
            json = urllib2.urlopen(jsons[i]).read()
            image = basic.get_middle_text(json, '"expressfashion/', ";")
            rest_of_images = basic.get_middle_text(json, ',expressfashion/', ";")
            temp = image + rest_of_images
            images_grouped = basic.add_to_dict(images_grouped, color_names[i], temp)
            images += temp
        return self.get_absolute_url(images), images_grouped

    def get_absolute_url(self, images):
        """Gets absolute path for images. Receives field of relative path images and
        returns absolute paths (scene7 urls, width-limited to 351px)."""
        image_urls = []
        for x in range(0, len(images)):
            image_url = "http://s7d5.scene7.com/is/image/expressfashion/" + images[x]
            image_url += "?width=351"
            image_urls.append(image_url)
        return image_urls

    def get_variants(self, page):
        """Getting variants from javascript on the page. Returns three dicts ids,
        sizes and prices.
        Format of the dicts is like (key = color, value = field of (ids, sizes and
        prices))"""
        # isolate the variant-creation javascript section by its comment markers
        temp = page.split("// Load the product variants")[1]
        temp = temp.split("// Set the field to update with the product variant")[0]
        variants = temp.split("// Create the variant")
        sizes = {}
        ids = {}
        prices = {}
        for i in range(1, len(variants)):
            color = basic.get_middle_text(variants[i], "Color','", "')")
            if color:
                color = color[0]
            else:
                color = "no_color"
            ids = basic.add_to_dict(ids, color, basic.get_middle_text(variants[i], "setId('", "')")[0])
            if variants[i].find("Size','") != -1:
                sizes = basic.add_to_dict(sizes, color, basic.get_middle_text(variants[i], "Size','", "')")[0])
            prices = basic.add_to_dict(prices, color, basic.get_middle_text(variants[i], 'numericPrice="', '"')[0])
        return ids, sizes, prices

    def get_image_url(self, images, is_swatch=False):
        """Returns path for images on our servers. If it's for swatch it return also
        swatch paths.

        NOTE(review): depends on module-level ``normal_image_url`` /
        ``thumb_image_url`` not visible in this file chunk — verify they exist.
        """
        image_paths = []
        thumb_paths = []
        for x in range(0, len(images)):
            path = normal_image_url + images[x] + ".jpg"
            thumb_path = thumb_image_url + images[x] + ".jpg"
            image_paths.append(path)
            thumb_paths.append(thumb_path)
        if is_swatch is True:
            return image_paths
        else:
            return image_paths, thumb_paths

    def create_child_products(self, main_id, ids, sizes, prices, images_grouped):
        """Creating child products (both colors and sizes). Arguments it gets are:
        main_id as product id of the master product, images_grouped that is a dict
        of images grouped by color (field i field) and dicts ids, sizes and prices
        (e.g. dict with color names as keys and fields of ids for it as values
        'black': ['32854, '32855'']).

        Color children get ids main_id_a, main_id_b, ...; size grandchildren get
        main_id_a_a, main_id_a_b, ... (chr(97)=='a'; NOTE(review): breaks past 26).
        """
        item = ExpressItem()
        i = 0
        for k in ids:
            cur_id = main_id + "_" + chr(i + 97)
            item['product_id'] = [cur_id]
            item['master_product_id'] = [main_id]
            item['color'] = [k]
            # use this for some other path (our server)
            # images, thumbs = self.get_image_url(images_grouped[i])
            if images_grouped:
                images = self.get_absolute_url(images_grouped[k])
                # item['normal_image_url'], item['thumb_image_url'] = self.map_url_to_server(images,main_id)
                item['normal_image_url'] = basic.cdata_field(self.map_url_to_server(images, main_id))
            self.xml.create_xml(item)
            item.clear()
            j = 0
            for val in ids[k]:
                item['product_id'] = [cur_id + "_" + chr(j + 97)]
                item['master_product_id'] = [cur_id]
                if len(sizes):
                    item['size'] = [sizes[k][j]]
                item['size_option_id'] = [ids[k][j]]
                item['price'] = [prices[k][j]]
                self.xml.create_xml(item)
                j += 1
            i += 1

    def map_url_to_server(self, urls, main_id, is_swatch=False):
        """Map original image urls to our-server paths (sha1-hashed filenames).

        NOTE(review): the immediate ``return urls`` short-circuits the mapping, so
        the code below is currently dead — apparently a deliberate toggle to keep
        original urls; confirm before relying on server paths.
        """
        return urls
        new = []
        new1 = []
        for i in range(0, len(urls)):
            new.append(image_path + "/" + main_id + "/full/" + hashlib.sha1(urls[i]).hexdigest() + ".jpg")
            if is_swatch is False:
                new1.append(image_path + "/" + main_id + "/thumb/" + hashlib.sha1(urls[i]).hexdigest() + ".jpg")
        if is_swatch is True:
            return new
        else:
            return new, new1

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting to
        database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        self.xml.write_xml(self.name, self.d['file'])
        msg += self.exc.create_message(self.counter)
        msg += "\n{0}".format(self.temp_msg)
        exp = CommonExport()
        # part for exporting to database here
        if self.d['upload']:
            try:
                exp.xml_to_db(self.name, self.d['file'], "e2b3b658-16d5-4059-a9df-3c212c817d2c")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        msg += self.log.get_message()
        from modules.mail import Mail
        mail = Mail()
        mail.send_mail(msg, "Express scraper report")

    def get_lists_from_excel(self):
        """Load url/id/shop-look/shop-line (and optional ordering) columns from the
        input Excel sheet into instance lists. Errors are routed to the exception
        handler with code 103."""
        xls = CommonExcel(basic.get_excel_path(self.name, self.d['file']))
        self.ordered = True
        try:
            self.url_list = xls.read_excel_collumn_for_urls(4, 1)
            self.id_list = xls.read_excel_collumn_for_ids(0, 1)
            self.shop_look_list = xls.read_excel_collumn(1, 1)
            self.shop_line_list = xls.read_excel_collumn(2, 1)
            try:
                self.order_list = xls.read_excel_collumn_for_ids(6, 1)
            except:
                # ordering column is optional
                self.ordered = False
                self.log.add_message("No order provided in this sheet.")
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)

    def add_properties(self, xml):
        """Declare the custom item properties this spider emits on the XML builder."""
        xml.add_property("size_option_id", "Size Option Id", "text")
        xml.add_property("color_image_url", "Color Image Url", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("variants", "Variants", "text_list")
        xml.add_property("style", "Style", "text")
        xml.add_property("mode", "Mode", "text")
        xml.add_property("shop_look", "Shop look", "boolean")
        xml.add_property("shop_line", "Shop line", "boolean")
        xml.add_property("normal", "Normal", "boolean")
        xml.add_property("ensemble_id", "Ensemble ID", "text")
        xml.add_property("promo_text", "Promo text", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("product_page", "Product page", "text")
        xml.add_property("master_price", "Master Price", "decimal")
        xml.add_property("subcategory_id", "Sub Category ID", "text")
        xml.add_property("add_to_cart_id", "Add to cart ID", "text")
        xml.add_property("order_index", "Order Index", "integer")
class LydiasSpider(CrawlSpider):
    """Scrapy spider for scraping Lydia's Uniforms product pages.

    Product list comes either from a database (interface run) or an Excel sheet
    (console run). Each page is parsed into a ``LydiasItem`` with colors, sizes
    (as JSON blobs), ratings and images; results go into a ``VariantsXml``
    document that is written out when the spider closes, followed by a status
    mail and (for database runs) a log file.

    NOTE(review): Python 2 / old Scrapy code (``print`` statements,
    ``StandardError``, ``HtmlXPathSelector``).
    """

    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]  # placeholder; replaced in __init__
    counter = 0  # number of responses processed so far

    def __init__(self, *a, **kw):
        """Wire the close signal, read CLI arguments and load the product list
        from database or Excel depending on the 'database' flag."""
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Parse one product page into a LydiasItem, create its color/size child
        data and record a per-product status ("ran", "not_avail" or "error") for
        the database update on close. Returns the item so the image pipeline can
        use 'image_urls'."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        # redirected requests must be matched back to the originally requested url
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        id = self.products['product_ids'][index]
        try:
            # the "searchfor" div only appears on the not-found/search page
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                item['name'], item['price'], item['old_price'], item['description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(id, color_names, product_image, color_codes, hxs)
                # collected for Scrapy's image download pipeline
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        """Return ["True"]/["False"] depending on a telltale javascript snippet
        that disables the logo-color control when embroidery is unavailable."""
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        """Return a list of CDATA-wrapped JSON strings, one per color, with the
        server-side swatch url, color name and short color code.

        NOTE(review): ``dict`` shadows the builtin and is reused across iterations;
        output is correct because dumps happens each pass, but a fresh dict per
        color would be clearer.
        """
        dict = {}
        jsons = []
        for i in range(0, len(color_urls)):
            dict['color_url'] = self.get_server_path_single(color_urls[i])
            dict['color_name'] = color_names[i]
            dict['color_short'] = color_codes[i]
            json = basic.cdata(simplejson.dumps(dict))
            jsons.append(json)
        return jsons

    # function for getting image server path
    def get_server_path_single(self, url):
        """Return the image-store path for one url (sha1-hashed filename)."""
        # return url
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
        """Return image-store paths for a list of urls (sha1-hashed filenames)."""
        # return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    #function for getting basic information for product
    def get_basic_info(self, hxs):
        """Return (name, price, old_price, [description]) for a product page.
        Prices are stripped to digits and dot; description is CDATA-wrapped."""
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        if not price:
            # alternate layout keeps the price in a span with id PriceDisplay
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        """Return (rating, sentence) — the numeric rating extracted from the
        "Rating: X out of Y" sentence, or two empty values when absent."""
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:", "out")
            return rating, temp
        else:
            return [], temp

    #function for getting reviews, returning rating and field of json reviews
    # or empty fields if there's no reviews
    def get_reviews(self, hxs):
        """Return a list of review JSON strings, or [] when the page has none.
        Only the first review block is read."""
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select('p[@class="review_title"]/text()').extract()
            text = reviews[0].select('p[@class="review_text"]/text()').extract()
            author = reviews[0].select('p[@class="review_author"]/text()').extract()
            location = reviews[0].select('p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    # function for making json for reviews
    # currently not in use. cause there are no reviews in DPW design
    def make_reviews_json(self, title, text, author, location):
        """Build CDATA-wrapped review JSON strings.

        NOTE(review): contains debug prints and an ``os._exit(0)`` that kills the
        whole process before any JSON is built — clearly leftover debugging; must
        be removed before this method is used again.
        """
        jsons = []
        print len(title)
        print len(text)
        print len(author)
        print len(location)
        os._exit(0)
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" : "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    #function for getting size chart image
    def get_size_image(self, hxs):
        """Return the size-chart image url(s) from the sizing tab panel."""
        temp = hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    #function for getting image swatches, returning fields (image_urls, image name, product color image)
    def get_image_swatches(self, hxs):
        """Return (color_images, color_names, products_image, color_codes) lists,
        one entry per color swatch node; the short color code is parsed out of the
        swatch's onclick javascript."""
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            #if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            color_codes.append(
                color.select('a/@onclick').extract()[0].split(",")[1].replace("'", ""))
        return color_images, color_names, products_image, color_codes

    #function for getting additional images, returns field of images or empty field if there is no
    def get_extra_images(self, hxs):
        """Return extra thumbnail image names parsed from the AddImg script block,
        or [] when the product has none."""
        additional_images = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    #function for getting product id from the page
    def get_product_id(self, hxs):
        """Return the product id embedded in the page-wrap javascript."""
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    # function for getting sizes from another url, retunrning field of jsons for sizes
    # one id from the page is 115NB, if needed here to hardcode for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        """Fetch the size-options AJAX endpoint for *id* and return size JSON blobs.
        Blocking urllib2 call; currently unused."""
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        page = urllib2.urlopen(url).read()
        print "page read"
        # the endpoint returns javascript arrays; normalize them to bare CSV rows
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" : "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function that handles creating subproducts, can be implemented for the usual way product for every combination
    # of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        item = LydiasItem()
        # if no colors for specific product do this part and call to creating size children with empty string instead
        # of actual color name
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)
        # for handling cases when there are color options for specific product, create child for every color, and call
        # for creating size children for every provided color
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + " code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path([product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    # function for creating child products for sizes
    # little messy with all the commented lines but those lines can be used if needed to go back to old way with
    # child products instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return a list of CDATA-wrapped size JSON blobs for one color of *main_id*.
        With a color code the sizes come from a blocking AJAX call; without one
        they are read off the page itself."""
        print color_code
        jsons = []
        # if block for cases when color is provided
        if color_code != "":
            showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                  "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            page = urllib2.urlopen(url).read()
            # normalize the javascript array response to bare CSV rows
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
                # item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
                # item['product_id'] = [id + "_" + str(i)]
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # when the color is not provided different block of code cause it's done differently on the page
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
                # item['master_product_id'] = [id]
                # item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(str(main_id), "", temp[i])
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
            # return 0

    # function for getting price for combination of every size and color, can return url where the price is, or can
    # parse that url to get that actual price but will drastically increase scraping time
    def get_size_price(self, id, color, size):
        """Return the (url-encoded) showprice AJAX url for a sku/color/size combo."""
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url

    # just adding part for getting absolute paths for relative paths from page
    def absolute_path(self, urls):
        """Prefix each relative page url with the site's domain."""
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    # function used for gettin embroidery information from clients page, was used only once to get it
    # cause embroidery is the same for all the products
    def get_emb(self, hxs):
        """One-off extractor for embroidery options (lettering colors, lettering
        styles, logos) written straight to XML.

        NOTE(review): uses the bare name ``xml`` (not ``self.xml``) — NameError
        unless a module-level xml object exists; the statements after ``return``
        are unreachable debug leftovers. One-shot utility, kept as-is.
        """
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        xml.create_xml(item)
        xml.write_xml("emb")
        return urls
        print colors, letterings, log
        os._exit(0)

    def handle_not_provided(self):
        """Emit a NOT_AVAILABLE XML record for every product that came without a URL."""
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting to
        database and sending appropriate mail message.

        NOTE(review): if the database branch fails before ``get_name`` returns,
        ``filename`` is unbound and the later ``write_xml``/mail calls raise —
        confirm whether that failure mode is acceptable.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            # persist the report next to other runs for the interface to pick up
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls/ids/names from the Excel sheet into ``self.products``, then
        dedupe, split off url-less rows into ``self.no_urls`` and attach a status
        column. Errors are routed to the exception handler with code 103."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
class LydiasSpider(CrawlSpider):
    """Spider for lydiasuniforms.com product pages.

    Loads the product list from a database (interface runs) or an Excel
    sheet (console runs), scrapes each product page into LydiasItem
    records, writes them out through VariantsXml, and mails a summary
    when the run ends.

    NOTE(review): Python 2 code throughout (print statements,
    StandardError); several locals shadow builtins (id, dict, json, l).
    """

    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    # number of responses parsed so far; compared to self.total at shutdown
    counter = 0

    def __init__(self, *a, **kw):
        """Read CLI arguments, build the XML writer, and load the product
        list either from the database or from the Excel sheet."""
        super(LydiasSpider, self).__init__(*a, **kw)
        # run spider_closed() when Scrapy signals end of crawl
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        # tolerate up to 5 per-product errors before aborting — presumably;
        # verify against ZmagsException implementation
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide: strip trailing "&cat=..." junk
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        # emit NOT_AVAILABLE stubs for products that came without a URL
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page into a LydiasItem and emit it as XML.

        Looks up the product by its (possibly pre-redirect) URL, records a
        per-product status ("ran" / "not_avail" / "error") back into
        self.products, and returns the item so Scrapy can download
        item['image_urls'].
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        # if the site redirected us, index by the URL we originally requested
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        id = self.products['product_ids'][index]
        try:
            # the "searchfor" div only appears on the not-found page
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                item['name'], item['price'], item['old_price'], item['description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                # child products (colors/sizes) are written as separate XML nodes
                self.create_subproducts(id, color_names, product_image, color_codes, hxs)
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                # product no longer exists on the site
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except:
            # deliberate catch-all: one bad product must not kill the crawl;
            # code_handler counts errors and aborts after the threshold
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    def get_embroidery(self, hxs):
        """Return ["True"] if the page's JS disables the logo-color input,
        i.e. the product has no embroidery options, else ["False"]."""
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    def make_colors_json(self, color_urls, color_names, color_codes):
        """Build one CDATA-wrapped JSON string per color swatch
        (server-side image path, display name, short code)."""
        # NOTE(review): `dict` shadows the builtin and is reused/mutated
        # across iterations; safe only because simplejson.dumps snapshots it
        dict = {}
        jsons = []
        for i in range(0, len(color_urls)):
            dict['color_url'] = self.get_server_path_single(color_urls[i])
            dict['color_name'] = color_names[i]
            dict['color_short'] = color_codes[i]
            json = basic.cdata(simplejson.dumps(dict))
            jsons.append(json)
        return jsons

    def get_server_path_single(self, url):
        """Map a source image URL to its path in the image store
        (Scrapy's images pipeline names files sha1(url).jpg)."""
        # return url
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path(self, urls):
        """Vector form of get_server_path_single() for a list of URLs."""
        # return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    def get_basic_info(self, hxs):
        """Extract name, price, old price and cleaned description.

        Returns (name, price, old_price, [description]); prices are
        stripped down to digits and dots.
        """
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        # fallback selector used by some page layouts
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        # NOTE(review): raises IndexError if both price selectors miss;
        # caught by the catch-all in parse()
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    def get_rating(self, hxs):
        """Return (rating, sentence) parsed from e.g.
        "Rating: 5 out of 6 votes"; ([], []) when absent."""
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:", "out")
            return rating, temp
        else:
            return [], temp

    def get_reviews(self, hxs):
        """Return review JSON strings from the first prodReview div,
        or [] when the page has no reviews."""
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select('p[@class="review_title"]/text()').extract()
            text = reviews[0].select('p[@class="review_text"]/text()').extract()
            author = reviews[0].select('p[@class="review_author"]/text()').extract()
            location = reviews[0].select('p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    def make_reviews_json(self, title, text, author, location):
        """Build CDATA-wrapped JSON per review.

        Currently not in use, since there are no reviews in the DPW design.
        NOTE(review): contains debug prints and an os._exit(0) that makes
        the loop below unreachable — the whole process dies if this is
        ever called. Dead to remove or fix before reuse.
        """
        jsons = []
        print len(title)
        print len(text)
        print len(author)
        print len(location)
        os._exit(0)
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" :\
 "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    def get_size_image(self, hxs):
        """Return the size-chart image URL(s) from the sizing tab."""
        temp = hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    def get_image_swatches(self, hxs):
        """Extract per-color swatch data.

        Returns parallel lists: (swatch image urls, color names,
        product image urls, color short-codes).
        """
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            # if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            # short code is the 2nd argument of the swatch's onclick handler
            color_codes.append(color.select('a/@onclick').extract()[0].split(",")[1].replace("'", ""))
        return color_images, color_names, products_image, color_codes

    def get_extra_images(self, hxs):
        """Return extra thumbnail URLs parsed out of the AddImg script
        block, or [] when the product has none."""
        additional_images = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    def get_product_id(self, hxs):
        """Return the product id embedded in the page's wrap script."""
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    def get_sizes(self, id, hxs):
        """Fetch size options from the showoptions AJAX endpoint and
        return them as CDATA JSON strings.

        One id from the page is 115NB, if needed here to hardcode for
        testing. Currently not in use.
        """
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        # synchronous fetch outside Scrapy's scheduler (blocks the reactor)
        page = urllib2.urlopen(url).read()
        print "page read"
        # crude cleanup of the JS-array response before splitting
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
 "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        """Emit one child XML product per color (or a single NO_COLOR
        child) with its size options attached as custom_size JSON.

        Can be reworked to the usual one-product-per-size-and-color
        scheme if needed.
        """
        item = LydiasItem()
        # no colors: single child carrying all sizes, color name passed as ""
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)
        # one child per provided color, each with its own size list
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + " code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path([product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return CDATA JSON size entries for one (product, color) pair.

        Commented lines are the old per-size child-product scheme, kept
        for reference in case we need to switch back from JSON.
        """
        print color_code
        jsons = []
        # color provided: sizes come from the showoptions AJAX endpoint
        if color_code != "":
            showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                  "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            # blocking fetch; response is a JS array we strip down by hand
            page = urllib2.urlopen(url).read()
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
                # item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
                # item['product_id'] = [id + "_" + str(i)]
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # no color: sizes are plain text divs on the page itself
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
                # item['master_product_id'] = [id]
                # item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(str(main_id), "", temp[i])
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # NOTE(review): unreachable — both branches above return
        return 0

    def get_size_price(self, id, color, size):
        """Return the showprice AJAX URL for a (sku, color, size) combo.

        Returns the URL rather than the fetched price; resolving each URL
        here would drastically increase scraping time.
        """
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url

    def absolute_path(self, urls):
        """Prefix site-relative paths with the lydiasuniforms.com origin."""
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    def get_emb(self, hxs):
        """One-off scrape of embroidery options (thread colors, lettering,
        logos) — embroidery is identical for all products, so this was run
        once and is not part of the normal flow.

        NOTE(review): `xml.create_xml` / `xml.write_xml` reference a bare
        `xml` name, not self.xml — this will NameError if called as-is.
        The print / os._exit(0) after `return urls` are unreachable debug
        leftovers.
        """
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        # range starts at 1: option[0] is the "choose one" placeholder — presumably
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        xml.create_xml(item)
        xml.write_xml("emb")
        return urls
        print colors, letterings, log
        os._exit(0)

    def handle_not_provided(self):
        """Emit a NOT_AVAILABLE XML stub for every product that has no URL."""
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml,
        exporting to database and sending appropriate mail message.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml: DB name for interface runs, sheet name otherwise
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # NOTE(review): if get_name() is what failed, `filename` is
                # unbound and write_xml below raises NameError
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            # appended after the mail attempt, so this note only reaches the log file
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls / product_ids / names from the Excel sheet into
        self.products, then split off rows without URLs into self.no_urls.

        Column/row offsets (3, 15 etc.) are sheet-layout specific.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
class KennethSpider(CrawlSpider):
    """Spider for kennethcole.com product pages.

    Loads the product list from a database or Excel, scrapes each product
    (plus per-color and per-size child products parsed out of inline
    JavaScript), emits XML via VariantsXml, and mails a summary at the end.

    NOTE(review): Python 2 code (print statements, iteritems, StandardError);
    references module-level globals `counter` / `images_number` that are not
    defined in this class.
    """

    name = "kenneth"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    # number of responses parsed so far; compared to self.total at shutdown
    counter = 0

    def __init__(self, *a, **kw):
        """Read CLI arguments, build the XML writer, and load the product
        list either from the database or from the Excel sheet."""
        super(KennethSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.images_store = "/" + settings['IMAGES_STORE'] + "/"
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        print self.d
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        # emit NOT_AVAILABLE stubs for products that came without a URL
        self.no_url_products(self.no_urls)
        self.start_urls = self.products['urls']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page into a KennethItem and emit it as XML.

        Main try for the whole scrape: any error is counted and reported
        with the URL it happened on rather than killing the crawl.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = KennethItem()
        try:
            cur_url = response.url
            # the noResultsContent div marks a product that no longer exists;
            # otherwise continue scraping the page
            available = hxs.select('//div[@id="noResultsContent"]').extract()
            if not available:
                index = self.products['urls'].index(cur_url)
                cur_id = self.get_product_id(cur_url)
                id = self.products['product_ids'][index]
                page = hxs.select('//div[@id="mainContent"]').extract()
                page = " ".join(page)
                item['name'], item['description'] = self.get_basic_info(hxs)
                price, new_p, old_p = self.get_prices(hxs)
                # sale products carry both a new and an old price
                if new_p:
                    item['new_price'] = new_p
                    item['old_price'] = old_p
                else:
                    item['price'] = price
                desc = basic.clean_string(item['description'][0])
                item['description'] = [desc]
                urls = self.get_color_image(hxs)
                new = self.get_image_server_path(urls, id)
                item['color_image_urls'] = new
                self.export(item['color_image_urls'], [id], "swatchImage")
                jsons, images = self.we_also_recommend(cur_id, id)
                item['product_page'] = [cur_url]
                item['product_id'] = [id]
                item['add_to_cart_id'] = [cur_id]
                item['recommended_product'] = jsons
                item['in_stock'] = ["IN_STOCK"]
                self.products['status'][index] = "ran"
                # get_colors() emits the per-color children and returns their
                # image urls, or 404 when the colors script is missing
                images_or_404 = self.get_colors(hxs, page, id)
                if images_or_404 == 404:
                    item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                item['image_urls'] = []
                if images_or_404 != 404:
                    item['image_urls'] += images_or_404
                item['image_urls'] += urls
                item['image_urls'] += images
                #self.export(item['image_urls'])
                #item['image_urls'] = []
                #uncomment for donwloading images
            else:
                # product not available: rebuild its canonical URL to find it
                # in our list, then emit a NOT_AVAILABLE stub
                cur_id = self.get_product_id(cur_url)
                cur_url = "http://www.kennethcole.com/product/index.jsp?"
                cur_url += "productId=" + str(cur_id)
                index = self.products['urls'].index(cur_url)
                self.products['status'][index] = "no_avail"
                item['product_id'] = [self.products['product_ids'][index]]
                # NOTE(review): condition checks product_ids but uses names —
                # looks like it should test self.products['names'][index]; confirm
                if self.products['product_ids'][index]:
                    item['name'] = [self.products['names'][index]]
                else:
                    item['name'] = ["not available"]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                self.exc.code_handler(102, cur_url)
        except:
            # deliberate catch-all: count the error and remember the URL it
            # happened on
            print "Error occured scraping this product"
            index = self.products['urls'].index(cur_url)
            self.products['status'][index] = "error"
            self.exc.code_handler(100, cur_url)
        return item

    def no_url_products(self, no_url):
        """Emit a NOT_AVAILABLE XML stub for every product without a URL."""
        item = KennethItem()
        for n in no_url['product_ids']:
            item['product_id'] = [n]
            index = no_url['product_ids'].index(n)
            item['name'] = [no_url['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return (name, [CDATA description]) from the product info blocks."""
        name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="productDescription"]').extract()[0])
        return name, [description]

    def get_prices(self, hxs):
        """Return ([price], new_price, old_price); new/old are non-empty
        only for sale products. Price is stripped to digits, dots, commas."""
        price = hxs.select('//div[@id="productInfoTop"]/h2/text()').extract()[0]
        new_p = hxs.select('//h2[@class="sale-now"]/text()').extract()
        old_p = hxs.select('//span[@class="productGrey"]/text()').extract()
        price = re.sub('[^0-9.,]', '', price)
        return [price], new_p, old_p

    def get_color_image(self, hxs):
        """Return the color swatch image URLs for the product."""
        return hxs.select('//div[@id="productInfoR2W"]/img/@src').extract()

    def get_colors(self, hxs, page, main_id):
        """Parse the page's `displays` JavaScript into per-color child
        products, emit them as XML, then trigger size-children creation.

        Returns the list of collected image URLs, or 404 when the colors
        script is absent (product not available).
        """
        item = KennethItem()
        try:
            tmp = page.split('displays[0]')[1]
        except IndexError:
            print "This product is not available"
            return 404
        script = tmp.split('</script>')[0]
        displays = script.split("};")
        # NOTE(review): `global counter` is a leftover; `counter` is never
        # used in this method
        global counter
        ids = []
        images = []
        color_ids = []
        sizes_script = self.get_sizes_part_page(page)
        # maps site colorId -> our running color index (as string)
        color_internal_code = {}
        for x in range(0, len(displays) - 1):
            id = basic.get_middle_text(displays[x], 'colorId: "', '"')
            ids.append(id[0])
            reg = displays[x].count("Reg")
            images_in = []
            # the script stores view images as vw1Reg..vwNReg, sometimes
            # without a space after the colon
            for i in range(1, reg + 1):
                image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg: "', '"')
                if len(image) == 0:
                    image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg:"', '"')
                if (len(image) > 0):
                    if (image[0] != "null"):
                        images_in.append(image[0])
            # fall back to the page's default product image
            if not images_in:
                images_in = hxs.select('//input[@name="productImage"]/@value').extract()
            color_ids.append(str(main_id) + "_" + str(x))
            item['product_id'] = [str(main_id) + "_" + str(x)]
            item['color_option_id'] = id
            item['master_product_id'] = [main_id]
            item['normal_image_url'] = self.get_image_server_path(images_in, main_id)
            item['thumb_image_url'] = self.get_image_server_path_thumb(images_in, main_id)
            item['in_stock'] = ["NOT_IN_STOCK"]
            item['color'] = self.get_color_name(sizes_script, id[0])
            color_internal_code[id[0]] = str(x)
            self.xml.create_xml(item)
            images += images_in
            self.export(item['normal_image_url'], item['product_id'], "productImage")
        self.get_sizes(sizes_script, ids, main_id, color_internal_code)
        return images

    def get_sizes(self, page, ids, main_id, color_internal_code):
        """Parse per-size options from the sizes JavaScript into dicts
        keyed by colorId ({id: [values...]}), then emit size children.

        Returns the collected dicts for optional further use.
        """
        options = page.split("};")
        skus = {}
        colors_name = {}
        inStocks = {}
        sizes = {}
        prices = {}
        for x in range(0, len(options) - 1):
            id = basic.get_middle_text(options[x], 'cId: "', '"')
            for i in range(0, len(ids)):
                if (id[0] == ids[i]):
                    sku = basic.get_middle_text(options[x], 'sku: ', ',s')
                    sku = re.sub("[^0-9]", "", sku[0])
                    skus = self.add_to_dict(skus, ids[i], sku)
                    size = basic.get_middle_text(options[x], 'sDesc: "', '"')
                    sizes = self.add_to_dict(sizes, ids[i], size[0])
                    price = basic.get_middle_text(options[x], 'price: "', '"')
                    price = self.clean_price(price[0])
                    prices = self.add_to_dict(prices, ids[i], price[0])
                    available = basic.get_middle_text(options[x], 'avail: "', '"')
                    inStocks = self.add_to_dict(inStocks, ids[i], available[0])
        self.create_subproducts_xml(main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices)
        return main_id, colors_name, sizes, skus, inStocks, prices

    def create_subproducts_xml(self, main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices):
        """Emit one XML child product per (color, size) combination,
        mapping the site's availability strings onto our stock values."""
        number = 0
        # NOTE(review): leftover, `counter` unused here
        global counter
        for k, v in sizes.iteritems():
            item = KennethItem()
            for i in range(0, len(v)):
                item['size'] = [v[i]]
                item['size_option_id'] = [skus[k][i]]
                m_id = main_id + "_" + color_internal_code[k]
                item['master_product_id'] = [m_id]
                id = m_id + "_" + str(i)
                item['product_id'] = [id]
                if inStocks[k][i] == "NOT_AVAILABLE":
                    item['in_stock'] = ["NOT_IN_STOCK"]
                elif inStocks[k][i] == "ADVANCED_SALE_LIMITED":
                    item['in_stock'] = ["IN_STOCK"]
                else:
                    item['in_stock'] = [inStocks[k][i]]
                item['price'] = [prices[k][i]]
                #item['color'] = colors_name[k]
                self.xml.create_xml(item)
                number += 1

    def add_to_dict(self, dict, index, value):
        """Append value to dict[index], creating the list on first use.
        (EAFP equivalent of collections.defaultdict(list).)"""
        try:
            dict[index].append(value)
        except:
            dict[index] = [value]
        return dict

    def we_also_recommend(self, id, main_id):
        """Fetch "we also recommend" products from the Certona (res-x)
        recommendation service.

        Returns (json list with per-product info, list of image urls).
        """
        url = "http://www.res-x.com/ws/r2/Resonance.aspx?appid=kennethcole01&t"
        url += "k=154212870918247&ss=525178103419747&sg=1&pg=897706724574618&b"
        url += "x=true&vr=2.67&sc=product_rr&ev=product&ei=" + id + "&cu=&ct=k"
        url += "ennethcolec01&no=3&cb=r1eh&clk=&cv1=" + id + "&cv23=63&ur=http%"
        url += "3A//www.kennethcole.com/product/index.jsp%3FproductId%3D3" + id
        url += "&plk=&rf="
        import urllib2
        # blocking fetch outside Scrapy's scheduler
        page = urllib2.urlopen(url).read()
        temp = page.split("certonaRecBoxes")
        images = []
        ids = []
        names = []
        prices = []
        urls = []
        # parse the recommendation boxes out of the response markup
        for i in range(1, len(temp)):
            id = [basic.get_middle_text(temp[i], "d=", '\\"')[0]]
            image = basic.get_middle_text(temp[i], 'src=\\"', '\\"')[0]
            name = basic.get_middle_text(temp[i], 'alt=\\"', '\\"')
            price = basic.get_middle_text(temp[i], '<br>', '</a>')
            url = "http://www.kennethcole.com/product/index.jsp?productId="
            url += id[0]
            urls.append(url)
            ids.append(id)
            names.append(name)
            prices.append(price)
            images.append(image)
        jsons = self.make_json(ids, names, prices, self.get_image_server_path(images, main_id), urls)
        return jsons, images

    def get_product_id(self, url):
        """Return the productId query value from a product URL."""
        return url.split("=")[1]

    def make_json(self, ids, names, prices, images, urls):
        """Build one CDATA JSON string per recommended product.
        NOTE(review): hand-built JSON — no escaping of quotes in names."""
        jsons = []
        for i in range(0, len(ids)):
            json = "{" + ' "id" : "' + str(ids[i][0]) + '", '
            json += '"name" : "' + str(names[i][0]) + '", '
            # insert function for storing the right image path
            json += '"image_url" : "' + str(images[i]) + '", '
            json += '"product_url" : "' + urls[i] + '", '
            json += '"price" : "' + str(prices[i][0]) + '" } '
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    def get_sizes_part_page(self, page):
        """Return the JavaScript fragment that declares the size options."""
        tmp = page.split("availDates = new Array();")[1]
        script = tmp.split("</script>")[0]
        return script

    def get_color_name(self, script, id):
        """Return [color name] for a colorId by scanning backwards from
        the id to the nearest preceding cDesc entry."""
        temp = script.split(id)
        temp = temp[0].split('cDesc: "')
        temp = temp[len(temp) - 1]
        name = temp.split('"')[0]
        return [name]
        # NOTE(review): unreachable alternative return kept from an older version
        return {id: name}

    def export(self, images, id, tags):
        """Upload product images to the admin REST endpoint.

        Disabled by default — set override to 0 to actually upload,
        anything else skips the upload.
        """
        override = 1
        if override == 0:
            import MultipartPostHandler
            import urllib2
            import os
            url = 'http://api.admin.zmags.com/productImage/import?key=5ef90922-283b-4412-a1c8-3e70bc28b9d3'
            for i in range(0, len(images)):
                image_name = self.get_image_name(images[i])
                path = "images/kenneth_images/small/" + str(image_name)
                params = {'file': file(path, 'rb'), 'product_id': id[0], 'index': str(i + 1), 'tags': tags}
                #token not working
                opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
                code = opener.open(url, params).getcode()
                # anything but 202 Accepted is treated as a failed upload
                if (code != 202):
                    print ("Achtung")
                global images_number
                images_number += 1
                print images_number
                print "Image uploaded to product " + id[0]
        else:
            #print "Image upload overriden.."
            pass

    def get_image_server_path(self, urls, id):
        """Map source image URLs to their full-size paths in the image
        store, keyed by product id and original file name."""
        # print urls
        new = []
        for url in urls:
            temp = url.split("/")
            new.append(self.images_store + id + "/full/" + temp[len(temp) - 1])
        return new

    def get_image_server_path_thumb(self, urls, id):
        """Thumbnail variant of get_image_server_path() ("/small/")."""
        new = []
        for url in urls:
            temp = url.split("/")
            new.append(self.images_store + id + "/small/" + temp[len(temp) - 1])
        return new

    def clean_price(self, price):
        """Strip a price string down to digits, dots and commas."""
        return [re.sub('[^0-9.,]', '', price)]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml,
        exporting to database and sending appropriate mail message.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped {0} product out of {1}\n\n".format(self.counter, self.total)
        # filename for writing xml: DB name for interface runs, sheet name otherwise
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # NOTE(review): if get_name() failed, `filename` is unbound
                # and write_xml below raises NameError
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            # NOTE(review): the try/except around the export is commented
            # out, so an export failure now propagates out of spider_closed
            #try:
            exp.xml_to_db(self.name, filename, "29eac9ea-8c57-4d22-baf4-3f1471dc3ab6")
            msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "KennethCole: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "KennethCole: {0}".format(filename), self.d['email'])
        except:
            # appended after the mail attempt, so this note only reaches the log file
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls / product_ids / names from the Excel sheet into
        self.products, then split off rows without URLs into self.no_urls.

        Column/row offsets (2, 2 etc.) are sheet-layout specific.
        NOTE(review): unlike the Lydias variant, the post-processing below
        is not inside an `else:` — it also runs after a handled read error.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(2, 2)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(0, 2)
            self.products['names'] = xls.read_excel_collumn(1, 2)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom product properties this spider emits with
        the XML writer."""
        xml.add_property("add_to_cart_id", "Add To Cart Id", "text")
        xml.add_property("product_page", "Product page", "text")
        xml.add_property("color_image_urls", "Color Image URLs", "text_list")
        xml.add_property("color_option_id", "Color Option ID", "text")
        xml.add_property("recommended_product", "Recommended Product", "text_list")
        xml.add_property("size_option_id", "Size Option ID", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("new_price", "New Price", "text")