def parse(self, response):
    """Scrape one product page into a ``GuitarCenterItem``.

    The product is located in ``self.products["urls"]`` by the URL that was
    originally requested: the first entry of ``redirect_urls`` when the
    request was redirected, otherwise ``response.url``. That lookup happens
    outside the ``try`` block, so a URL missing from the list propagates to
    the caller.

    On success the product status is set to ``"ran"``, the item is written
    through ``self.xml.create_xml`` and ``image_urls`` is filled with the
    product and brand images. On a ``StandardError`` the status is set to
    ``"error"`` and code 100 is reported through ``self.exc.code_handler``.

    Returns the item in both cases; after an error it may be only partially
    populated.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = GuitarCenterItem()

    request_meta = response.request.meta
    if "redirect_urls" in request_meta:
        cur_url = request_meta["redirect_urls"][0]
    else:
        cur_url = response.url
    index = self.products["urls"].index(cur_url)

    try:
        item["product_id"] = [self.products["product_ids"][index]]
        item["name"], item["brand"] = self.get_basic_info(hxs)
        (item["heading"], item["details"], item["specs"],
         item["call_to_action"]) = self.get_description(hxs)
        (item["brand_image"], item["brand_image_promo"],
         brand_images) = self.get_description_images(hxs)
        item["old_price"], item["discount"], item["price"] = self.get_prices(hxs)
        item["image_json"], img = self.get_images(hxs)
        item["serial"] = self.get_serials(hxs)
        item["warranty"] = self.gold_coverage(hxs)
        item["in_stock"] = self.get_available(hxs)
        item["product_ref"], item["add_to_cart_id"] = self.get_add_to_cart(hxs)
        # Without an add-to-cart id the product is reported as unavailable,
        # whatever get_available() said.
        if not item["add_to_cart_id"]:
            item["in_stock"] = ["NOT_AVAILABLE"]
        item["shipping"] = self.get_shipping(hxs)
        item["colors"] = self.get_colors(hxs)
        self.products["status"][index] = "ran"
    except StandardError:
        self.products["status"][index] = "error"
        self.exc.code_handler(100, response.url)
    else:
        # image_urls is added only after the XML has been written, so it is
        # not part of what create_xml() receives.
        self.xml.create_xml(item)
        item["image_urls"] = img + brand_images
    return item
def parse(self, response):
    """Scrape one product page into a ``GuitarCenterItem``.

    The product is located in ``self.products['urls']`` by the URL that was
    originally requested: the first entry of ``redirect_urls`` when the
    request was redirected, otherwise ``response.url``. That lookup happens
    outside the ``try`` block, so a URL missing from the list propagates to
    the caller.

    On success the product status is set to ``"ran"``, the item is written
    through ``self.xml.create_xml`` and ``image_urls`` is filled with the
    product and brand images. On a ``StandardError`` the status is set to
    ``"error"`` and code 100 is reported through ``self.exc.code_handler``.

    Returns the item in both cases; after an error it may be only partially
    populated.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = GuitarCenterItem()

    request_meta = response.request.meta
    if 'redirect_urls' in request_meta:
        cur_url = request_meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)

    try:
        item['product_id'] = [self.products['product_ids'][index]]
        item['name'], item['brand'] = self.get_basic_info(hxs)
        (item['heading'], item['details'], item['specs'],
         item['call_to_action']) = self.get_description(hxs)
        (item['brand_image'], item['brand_image_promo'],
         brand_images) = self.get_description_images(hxs)
        item['old_price'], item['discount'], item['price'] = self.get_prices(hxs)
        item['image_json'], img = self.get_images(hxs)
        item['serial'] = self.get_serials(hxs)
        item['warranty'] = self.gold_coverage(hxs)
        item['in_stock'] = self.get_available(hxs)
        item['product_ref'], item['add_to_cart_id'] = self.get_add_to_cart(hxs)
        # Without an add-to-cart id the product is reported as unavailable,
        # whatever get_available() said.
        if not item['add_to_cart_id']:
            item['in_stock'] = ["NOT_AVAILABLE"]
        item['shipping'] = self.get_shipping(hxs)
        item['colors'] = self.get_colors(hxs)
        self.products['status'][index] = "ran"
    except StandardError:
        self.products['status'][index] = "error"
        self.exc.code_handler(100, response.url)
    else:
        # image_urls is added only after the XML has been written, so it is
        # not part of what create_xml() receives.
        self.xml.create_xml(item)
        item['image_urls'] = img + brand_images
    return item
def parse(self, response):
    """Scrape one product page into a ``SportmanItem``.

    The product is located in ``self.products["urls"]`` by the originally
    requested URL (first entry of ``redirect_urls`` when redirected, else
    ``response.url``); a URL missing from that list propagates to the caller.

    A redirected request is treated as an unavailable product: a minimal
    item is written to XML, code 102 is reported and the status becomes
    ``"no_avail"``. Otherwise the page is scraped, the item is written to
    XML, then cleared and refilled with only ``image_urls``, and the status
    becomes ``"ran"``.

    Returns the item on success. When any exception is raised while
    scraping, code 100 is reported, the status becomes ``"error"`` and
    ``None`` is returned.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = SportmanItem()

    redirected = "redirect_urls" in response.request.meta
    if redirected:
        cur_url = response.request.meta["redirect_urls"][0]
    else:
        cur_url = response.url
    index = self.products["urls"].index(cur_url)

    try:
        if redirected:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"] = [self.products["names"][index]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            (item["name"], item["short_desc"], item["description"],
             item["old_price"], item["custom_price"], item["product_id"],
             item["sku"]) = self.get_basic_info(hxs)
            item["in_stock"] = ["IN_STOCK"]

            # get_vars() is unpacked into eight values; only the first two
            # are used in this method.
            (viewstate, eventval, _prevpage, _hidden, _view_page,
             _even_page, _pre_page, _hidd_page) = self.get_vars(response, hxs)

            # The view state is stored in 2000-character pieces:
            # viewstate1..viewstate5 hold one piece each and viewstate6
            # holds everything from offset 10000 onwards.
            piece = 2000
            for number in range(1, 6):
                start = (number - 1) * piece
                item["viewstate%d" % number] = [
                    basic.cdata(viewstate[start:start + piece])
                ]
            item["viewstate6"] = [basic.cdata(viewstate[5 * piece:])]
            item["eventval"] = [basic.cdata(eventval)]

            item["size_options"] = self.get_variants(hxs, response)
            images_url = self.get_images(hxs)
            item["normal_image_url"] = self.get_server_path(images_url)
            self.xml.create_xml(item)

            # After the XML is written the returned item carries nothing
            # but the image URLs.
            item.clear()
            item["image_urls"] = self.get_images(hxs)
            self.products["status"][index] = "ran"
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse_can(self, response):
    """Parse a product page of a Canadian site into a ``PartyliteItem``.

    The request meta may carry a ``language`` entry; when it does, the
    product id is suffixed with ``_can_<language>``.

    A redirected request is treated as an unavailable product: code 102 is
    reported, the item is marked ``NOT_AVAILABLE`` and written to XML. If
    the redirected id is known in ``self.products`` its name is attached and
    its status set to ``'no_avail'``; otherwise a message is printed.

    A normal response is scraped in full. On a ``StandardError`` the status
    becomes ``"error"`` and code 100 is reported; otherwise the item is
    written to XML.

    Returns the item; ``image_urls`` is set only when images were found.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    item = PartyliteItem()
    hxs = HtmlXPathSelector(response)
    image_urls = []

    if 'redirect_urls' in response.request.meta:
        redirect_urls = response.request.meta['redirect_urls']
        original_id = self.get_id(redirect_urls[0])[0]
        item['product_id'] = [original_id]
        # NOTE(review): the whole redirect_urls list is passed here, not a
        # single URL - confirm that code_handler expects that.
        self.exc.code_handler(102, redirect_urls)
        if 'language' in response.request.meta:
            item['product_id'] = [original_id + "_can" + "_" + response.meta['language']]
        try:
            index = self.products['product_ids'].index(original_id)
            item['name'] = [basic.cdata(item['product_id'][0] + self.products['names'][index])]
            self.products['status'][index] = 'no_avail'
        except (KeyError, ValueError):
            # list.index() signals a missing id with ValueError; KeyError
            # covers a missing key in self.products.
            print("This %s id is not in list" % (item['product_id'][0]))
        item['in_stock'] = ['NOT_AVAILABLE']
        item['product_id'] = self.remove_spaces(item['product_id'])
        self.xml.create_xml(item)
    else:
        # Looked up outside the try block: an unknown id propagates.
        index = self.products['product_ids'].index(self.get_id(response.url)[0])
        try:
            item['product_id'] = self.get_id(response.url)
            item['name'], item['shown_with'] = self.get_basic_info(hxs)
            item['description'] = self.get_description(hxs)
            if 'language' in response.meta:
                item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
            response.meta['item'] = item
            page = " ".join(hxs.select('//html').extract())
            image_urls = self.get_more_images(page)
            item['normal_image_url'] = self.get_server_path_field(image_urls)
            item['in_stock'] = self.get_in_stock(hxs)
            color_products = self.create_subproducts(page)
            if color_products:
                # NOTE(review): `xml` is not defined in this method; unless
                # it is a module-level name this raises NameError, which is
                # a StandardError and is handled below - confirm whether
                # self.xml was intended.
                self.write_subproducts(item['product_id'], color_products, xml)
            else:
                item['add_to_cart_id'] = self.get_add_to_cart_id(page)
                item['custom_price'], item['custom_discount'] = self.get_price(hxs)
            self.products['status'][index] = "ran"
        except StandardError:
            basic.print_error()
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            item['product_id'] = self.remove_spaces(item['product_id'])
            self.xml.create_xml(item)

    if image_urls:
        item['image_urls'] = image_urls
    return item
def parse(self, response):
    """Scrape one product page into a ``BootsItem`` and return it.

    The main item is always written through ``self.xml.create_xml``. When
    colour or size variants exist they are written by
    ``create_color_variants`` / ``create_size_variants``; otherwise price,
    points price and collectable points are read from the page and stored
    on the item itself. ``image_urls`` is attached last, after the XML has
    been written.

    Nothing is caught here: any error raised while scraping propagates to
    the caller.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BootsItem()

    (item['product_id'], item['store_id'], item['lang_id'],
     item['catalog_id']) = self.get_ids(hxs)
    item['name'] = self.get_name(hxs)
    (item['short_description'], sponsored, description, in_stock,
     item['ingredients'], patient_information_url, item['offer'],
     item['promotion']) = self.get_description(hxs)
    item['rating'] = self.get_rating(hxs)
    size, price_per_size = self.get_size(hxs)
    item['normal_image_url'], image_urls = self.get_images(hxs)
    brand, brand_image_url = self.get_brand(hxs)
    item['save_money'], item['old_price'] = self.get_oldies(hxs)

    # Each description part gets its own numbered field, starting at 1.
    for number, text in enumerate(description, 1):
        item['description_%d' % number] = [basic.cdata(text)]

    if sponsored is not None:
        item['sponsored'] = sponsored

    item['in_stock'] = ["IN_STOCK"] if in_stock == "In stock" else ["NOT_IN_STOCK"]

    # Hidden form inputs copied onto the item as extracted.
    for field, input_name in (('order_id', 'orderId'),
                              ('cat_entry_id', 'catEntryId'),
                              ('calculation_usage_id', 'calculationUsageId')):
        item[field] = hxs.select('//input[@name="%s"]/@value' % input_name).extract()

    if brand_image_url is not None:
        item['brand'] = brand
        item['brand_image_url'] = ["43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(brand_image_url)]
        image_urls.append(brand_image_url)
    if patient_information_url is not None:
        item['patient_information_url'] = [basic.cdata(patient_information_url)]

    (prices, point_prices, collect_points, colors, color_image_urls,
     variant_ids) = self.get_color_variants(hxs)
    if size is not None:
        item['size'] = size
        item['price_per_size'] = price_per_size
    elif variant_ids is None:
        # No single size and no colour variants: try size variants instead.
        (prices, point_prices, collect_points, sizes,
         variant_ids) = self.get_size_variants(hxs)
    if color_image_urls is not None:
        image_urls.extend(color_image_urls)

    if variant_ids is None:
        # Plain product: prices live on the main item.
        prices = hxs.select('//p[@class="price"]/text()').extract()[0]
        point_prices = hxs.select('//span[@class="pointsPrice"]/text()').extract()[0]
        collect_points = [basic.get_price(hxs.select('//p[@class="collectPoints"]/text()').extract()[0])]
        item['price'] = [basic.get_price(prices)]
        item['points_price'] = [basic.get_price(point_prices)]
        item['collect_points'] = collect_points
        self.xml.create_xml(item)
    else:
        self.xml.create_xml(item)
        if colors is not None:
            self.create_color_variants(prices, point_prices, colors, color_image_urls,
                                       variant_ids, collect_points, item['product_id'])
        else:
            self.create_size_variants(prices, point_prices, sizes, variant_ids,
                                      collect_points, item['product_id'])

    item['image_urls'] = image_urls
    return item
def parse(self, response):
    """Scrape one product page into a ``SportmanItem``.

    The product is located in ``self.products['urls']`` by the originally
    requested URL (first entry of ``redirect_urls`` when redirected, else
    ``response.url``); a URL missing from that list propagates to the caller.

    A redirected request is treated as an unavailable product: a minimal
    item is written to XML, code 102 is reported and the status becomes
    ``"no_avail"``. Otherwise the page is scraped, the item is written to
    XML, then cleared and refilled with only ``image_urls``, and the status
    becomes ``"ran"``.

    Returns the item on success. When any exception is raised while
    scraping, code 100 is reported, the status becomes ``"error"`` and
    ``None`` is returned.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = SportmanItem()

    redirected = 'redirect_urls' in response.request.meta
    if redirected:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)

    try:
        if redirected:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            (item["name"], item["short_desc"], item["description"],
             item["old_price"], item["custom_price"], item["product_id"],
             item["sku"]) = self.get_basic_info(hxs)
            item['in_stock'] = ['IN_STOCK']

            # get_vars() is unpacked into eight values; only the first two
            # are used in this method.
            (viewstate, eventval, _prevpage, _hidden, _view_page,
             _even_page, _pre_page, _hidd_page) = self.get_vars(response, hxs)

            # The view state is stored in 2000-character pieces:
            # viewstate1..viewstate5 hold one piece each and viewstate6
            # holds everything from offset 10000 onwards.
            piece = 2000
            for number in range(1, 6):
                start = (number - 1) * piece
                item["viewstate%d" % number] = [
                    basic.cdata(viewstate[start:start + piece])
                ]
            item["viewstate6"] = [basic.cdata(viewstate[5 * piece:])]
            item["eventval"] = [basic.cdata(eventval)]

            item["size_options"] = self.get_variants(hxs, response)
            images_url = self.get_images(hxs)
            item["normal_image_url"] = self.get_server_path(images_url)
            self.xml.create_xml(item)

            # After the XML is written the returned item carries nothing
            # but the image URLs.
            item.clear()
            item['image_urls'] = self.get_images(hxs)
            self.products["status"][index] = "ran"
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse(self, response):
    """Scrape one product page into a ``LydiasItem`` and return it.

    The product is located in ``self.products['urls']`` by the originally
    requested URL (first entry of ``redirect_urls`` when redirected, else
    ``response.url``); a URL missing from that list propagates to the caller.

    The page counts as unavailable when ``//div[@id="searchfor"]`` has text:
    code 102 is reported, a ``NOT_AVAILABLE`` item is written to XML and the
    status becomes ``"not_avail"``. Otherwise the product is scraped,
    written to XML together with its subproducts, ``image_urls`` is filled
    and the status becomes ``"ran"``.

    If any exception is raised while scraping, the status becomes
    ``"error"`` and code 100 is reported; the (possibly partial) item is
    still returned.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = LydiasItem()

    if 'redirect_urls' in response.request.meta:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)
    product_id = self.products['product_ids'][index]

    try:
        available = hxs.select('//div[@id="searchfor"]/text()').extract()
        if not available:
            item['product_id'] = [product_id]
            (item['name'], item['price'], item['old_price'],
             item['description']) = self.get_basic_info(hxs)
            item['rating'], item['custom_rating'] = self.get_rating(hxs)

            chart = self.absolute_path(self.get_size_image(hxs))
            item['sizes_chart_image_url'] = self.get_server_path(chart)

            (color_urls, color_names, product_image,
             color_codes) = self.get_image_swatches(hxs)
            color_urls = self.absolute_path(color_urls)
            item['color_image_url'] = self.make_colors_json(
                color_urls, color_names, color_codes)

            item['in_stock'] = ["IN_STOCK"]
            item['embroidery'] = self.get_embroidery(hxs)

            default_images = self.absolute_path(self.get_extra_images(hxs))
            item['default_image_url'] = self.get_server_path(default_images)
            self.xml.create_xml(item)

            product_image = self.absolute_path(product_image)
            self.create_subproducts(product_id, color_names, product_image,
                                    color_codes, hxs)
            # Set after create_xml(), so image_urls is not part of the XML.
            item['image_urls'] = product_image + color_urls + chart + default_images
            self.products['status'][index] = "ran"
        else:
            self.exc.code_handler(102, response.url)
            item['product_id'] = [product_id]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.products['status'][index] = "not_avail"
            self.xml.create_xml(item)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        self.products['status'][index] = "error"
        self.exc.code_handler(100, response.url)
    return item
def parse(self, response):
    """Scrape one product page into a ``LydiasItem`` and return it.

    The product is located in ``self.products['urls']`` by the originally
    requested URL (first entry of ``redirect_urls`` when redirected, else
    ``response.url``); a URL missing from that list propagates to the caller.

    The page counts as unavailable when ``//div[@id="searchfor"]`` has text:
    code 102 is reported, a ``NOT_AVAILABLE`` item is written to XML and the
    status becomes ``"not_avail"``. Otherwise the product is scraped,
    written to XML together with its subproducts, ``image_urls`` is filled
    and the status becomes ``"ran"``.

    If any exception is raised while scraping, the status becomes
    ``"error"`` and code 100 is reported; the (possibly partial) item is
    still returned.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = LydiasItem()

    meta = response.request.meta
    cur_url = meta['redirect_urls'][0] if 'redirect_urls' in meta else response.url
    index = self.products['urls'].index(cur_url)
    product_id = self.products['product_ids'][index]

    try:
        available = hxs.select('//div[@id="searchfor"]/text()').extract()
        if available:
            self.exc.code_handler(102, response.url)
            item['product_id'] = [product_id]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.products['status'][index] = "not_avail"
            self.xml.create_xml(item)
        else:
            item['product_id'] = [product_id]
            (item['name'], item['price'], item['old_price'],
             item['description']) = self.get_basic_info(hxs)
            item['rating'], item['custom_rating'] = self.get_rating(hxs)

            chart = self.absolute_path(self.get_size_image(hxs))
            item['sizes_chart_image_url'] = self.get_server_path(chart)

            (color_urls, color_names, product_image,
             color_codes) = self.get_image_swatches(hxs)
            color_urls = self.absolute_path(color_urls)
            item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)

            item['in_stock'] = ["IN_STOCK"]
            item['embroidery'] = self.get_embroidery(hxs)

            default_images = self.absolute_path(self.get_extra_images(hxs))
            item['default_image_url'] = self.get_server_path(default_images)
            self.xml.create_xml(item)

            product_image = self.absolute_path(product_image)
            self.create_subproducts(product_id, color_names, product_image, color_codes, hxs)
            # Set after create_xml(), so image_urls is not part of the XML.
            item['image_urls'] = product_image + color_urls + chart + default_images
            self.products['status'][index] = "ran"
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        self.products['status'][index] = "error"
        self.exc.code_handler(100, response.url)
    return item
def parse(self, response):
    """Scrape one product page into a ``BurtonItem``.

    The product is located in ``self.products['urls']`` by the originally
    requested URL (first entry of ``redirect_urls`` when redirected, else
    ``response.url``); a URL missing from that list propagates to the caller.

    A redirected request is treated as an unavailable product: a minimal
    ``NOT_AVAILABLE`` item is written to XML, code 102 is reported and the
    status becomes ``"no_avail"``. Otherwise the page is scraped, written to
    XML, ``image_urls`` is filled and the status becomes ``"ran"``.

    Returns the item on success. When any exception is raised while
    scraping, code 100 is reported, the status becomes ``"error"`` and
    ``None`` is returned.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BurtonItem()
    page = hxs.extract()

    redirected = 'redirect_urls' in response.request.meta
    if redirected:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)

    try:
        if redirected:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            item['product_id'], item['name'] = self.get_basic_info(hxs)
            item['description'], item['features'] = self.get_description(hxs)
            item['variants'], thumb_urls, color_names = self.get_variants(page)
            item['all_sizes'] = self.get_all_sizes(page)
            item['color_json'], image_urls = self.get_colors(page, color_names)
            item['price'], item['old_price'] = self.get_prices(hxs)
            item['in_stock'] = ['IN_STOCK']
            item['product_link'] = [basic.cdata(response.url)]
            self.xml.create_xml(item)
            # Set after create_xml(), so image_urls is not part of the XML.
            item['image_urls'] = image_urls + thumb_urls
            self.products["status"][index] = "ran"
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse(self, response):
    """Scrape one product page into a ``BurtonItem``.

    The product is located in ``self.products['urls']`` by the originally
    requested URL (first entry of ``redirect_urls`` when redirected, else
    ``response.url``); a URL missing from that list propagates to the caller.

    A redirected request is treated as an unavailable product: a minimal
    ``NOT_AVAILABLE`` item is written to XML, code 102 is reported and the
    status becomes ``"no_avail"``. Otherwise the page is scraped, written to
    XML, ``image_urls`` is filled and the status becomes ``"ran"``.

    Returns the item on success. When any exception is raised while
    scraping, code 100 is reported, the status becomes ``"error"`` and
    ``None`` is returned.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BurtonItem()
    page = hxs.extract()

    meta = response.request.meta
    redirected = 'redirect_urls' in meta
    cur_url = meta['redirect_urls'][0] if redirected else response.url
    index = self.products['urls'].index(cur_url)

    try:
        if not redirected:
            item['product_id'], item['name'] = self.get_basic_info(hxs)
            item['description'], item['features'] = self.get_description(hxs)
            item['variants'], thumb_urls, color_names = self.get_variants(page)
            item['all_sizes'] = self.get_all_sizes(page)
            item['color_json'], image_urls = self.get_colors(page, color_names)
            item['price'], item['old_price'] = self.get_prices(hxs)
            item['in_stock'] = ['IN_STOCK']
            item['product_link'] = [basic.cdata(response.url)]
            self.xml.create_xml(item)
            # Set after create_xml(), so image_urls is not part of the XML.
            item['image_urls'] = image_urls + thumb_urls
            self.products["status"][index] = "ran"
        else:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse(self, response):
    """Scrape one product page into an ``ExpressItem`` and return it.

    The entry of ``self.url_list`` matching ``response.url`` is replaced by
    the running counter; its position ``index`` is then used for the
    parallel lists ``id_list``, ``shop_look_list`` and ``shop_line_list``.
    A URL missing from ``url_list`` propagates to the caller.

    Outcomes, each reported through ``self.exc.code_handler``:

    * URL equals ``http://www.zmags.com/``: code 101 and a placeholder
      ``NOT_AVAILABLE`` item is written to XML.
    * The 404 image is present on the page: code 104, nothing is written.
    * The page says the item is no longer available: the main item is
      written as ``NOT_IN_STOCK`` and code 102 is reported.
    * Otherwise the main item, its child products and any shop-look /
      shop-line products are written.
    * Any ``StandardError`` while doing the above: code 100.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = ExpressItem()
    index = self.url_list.index(response.url)
    self.url_list[index] = self.counter

    # main try that catches all unhandled errors
    try:
        if response.url == "http://www.zmags.com/":
            basic.not_provided()
            self.exc.code_handler(101, response.url)
            item['product_id'] = [self.id_list[index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            item['name'] = ["not available"]
            self.xml.create_xml(item)
        elif hxs.select('//img[@alt="404 Error Page Not Found"]').extract():
            self.exc.code_handler(104, response.url)
        else:
            available = hxs.select('//span[@class="glo-tex-error"]/text()').extract()
            page = " ".join(hxs.select('//html').extract())

            # part for creating main product in xml
            product_id = self.get_product_id(hxs)[0]
            if product_id != self.id_list[index]:
                msg = "\nNot equal, id in sheet {0}, on site {1}".format(self.id_list[index], product_id)
                self.temp_msg += msg
            item['product_id'] = [product_id]
            item['name'] = self.get_name(hxs)
            item['description'], item['promo_text'] = self.get_basic_info(hxs)
            item['master_price'], item['discount_price'] = self.get_product_prices(hxs)
            item['shop_look'] = ['False']
            item['normal'] = ['True']
            item['shop_line'] = ['False']
            item['in_stock'] = ["NOT_IN_STOCK"]

            # NOTE(review): available[0] raises IndexError when the error
            # span is absent from the page; that is handled below as
            # code 100 - confirm the span is always present.
            if available[0] != "This item is no longer available for purchase.":
                item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
                item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
                (color_names, urls, _swatch_image_names,
                 jsons) = self.get_swatch_images(hxs)
                item['color_image_url'] = self.create_color_json(urls, color_names)
                item['in_stock'] = ["IN_STOCK"]
                item['product_page'] = [response.url]
                self.xml.create_xml(item)

                product_images, images_grouped = self.parse_jsons(jsons, color_names)
                ids, sizes, prices = self.get_variants(page)
                # calling function that will handle creating all child products
                self.create_child_products(product_id, ids, sizes, prices, images_grouped)
                item['image_urls'] = urls + product_images

                # Shop-look and shop-line entries go through the same helper.
                if self.shop_look_list[index]:
                    self.parse_for_shop_look(hxs, self.shop_look_list[index], product_id,
                                             page, images_grouped, response.url, index)
                if self.shop_line_list[index]:
                    self.parse_for_shop_look(hxs, self.shop_line_list[index], product_id,
                                             page, images_grouped, response.url, index)
            else:
                self.xml.create_xml(item)
                self.exc.code_handler(102, response.url)
    except StandardError:
        self.exc.code_handler(100, response.url)
    return item
def parse(self, response):
    """Scrape one product page into a ``KennethItem`` and return it.

    When the page has no ``noResultsContent`` div the product is scraped,
    its status is set to ``"ran"`` and the item is written to XML; if
    ``get_colors`` returns ``404`` the item is marked ``NOT_AVAILABLE``
    instead of ``IN_STOCK``. ``image_urls`` is filled after the XML has
    been written.

    When the div is present the product is treated as unavailable: its
    canonical URL is rebuilt from the product id, the status becomes
    ``"no_avail"``, a ``NOT_AVAILABLE`` item is written and code 102 is
    reported.

    Any exception is reported as code 100 and, when the URL is tracked in
    ``self.products['urls']``, the status is set to ``"error"``. The
    (possibly partial) item is still returned.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = KennethItem()
    # Bound before the try block so the error handler can always use it.
    cur_url = response.url

    # main try for the script: the general except below runs if anything
    # fails while scraping
    try:
        # search for the noResultsContent div on the page; if it exists the
        # product is not on their site, otherwise continue scraping
        available = hxs.select('//div[@id="noResultsContent"]').extract()
        if not available:
            index = self.products['urls'].index(cur_url)
            cur_id = self.get_product_id(cur_url)
            product_id = self.products['product_ids'][index]
            page = " ".join(hxs.select('//div[@id="mainContent"]').extract())

            item['name'], item['description'] = self.get_basic_info(hxs)
            price, new_p, old_p = self.get_prices(hxs)
            if new_p:
                item['new_price'] = new_p
                item['old_price'] = old_p
            else:
                item['price'] = price
            item['description'] = [basic.clean_string(item['description'][0])]

            urls = self.get_color_image(hxs)
            item['color_image_urls'] = self.get_image_server_path(urls, product_id)
            self.export(item['color_image_urls'], [product_id], "swatchImage")
            jsons, images = self.we_also_recommend(cur_id, product_id)

            item['product_page'] = [cur_url]
            item['product_id'] = [product_id]
            item['add_to_cart_id'] = [cur_id]
            item['recommended_product'] = jsons
            item['in_stock'] = ["IN_STOCK"]
            self.products['status'][index] = "ran"

            # get_colors() returns either a list of images or the number 404.
            images_or_404 = self.get_colors(hxs, page, product_id)
            if images_or_404 == 404:
                item['in_stock'] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)

            item['image_urls'] = []
            if images_or_404 != 404:
                item['image_urls'] += images_or_404
            item['image_urls'] += urls
            item['image_urls'] += images
        else:
            # part for handling products that are not available
            cur_id = self.get_product_id(cur_url)
            cur_url = "http://www.kennethcole.com/product/index.jsp?"
            cur_url += "productId=" + str(cur_id)
            index = self.products['urls'].index(cur_url)
            self.products['status'][index] = "no_avail"
            item['product_id'] = [self.products['product_ids'][index]]
            if self.products['product_ids'][index]:
                item['name'] = [self.products['names'][index]]
            else:
                item['name'] = ["not available"]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)
            self.exc.code_handler(102, cur_url)
    except Exception:
        # part for catching errors and keeping track of how many happened
        # and at which urls
        print("Error occured scraping this product")
        try:
            index = self.products['urls'].index(cur_url)
        except ValueError:
            # The URL is not tracked (possibly the very reason we are
            # here), so there is no status entry to update.
            pass
        else:
            self.products['status'][index] = "error"
        self.exc.code_handler(100, cur_url)
    return item