def parse_item(self, response):
    """Scrape a 'vitamina' product page and yield it as an ``Item``.

    The URL is first looked up in ``self.links``; when an entry already
    exists the page is reported as old and nothing is yielded.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'vitamina'
    product['breadcrumb'] = []
    product['title'] = page.xpath('.//h1[@id="nombreProducto"]/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//p[@itemprop="description"]/text()').extract())
    product['code'] = ''

    # Prefer the price inside the "special-price" block; otherwise fall back
    # to the first generic price node.
    special_prices = page.xpath(
        './/section[@id="datos"]//p[@class="special-price"]/span[@itemprop="price" and @class="price"]/@content'
    ).extract()
    if special_prices:
        raw_price = special_prices[0]
    else:
        raw_price = page.xpath('.//span[@itemprop="price"]/@content').extract()[0]
    product['price'] = price_normalize(raw_price)

    product['sizes'] = page.xpath(
        './/li[@class="swatchContainer"]/div[@class="swatch"]/text()').extract()
    product['image_urls'] = page.xpath(
        './/div[@class="fotozoom"]/img[@class="zoomImg"]/@src').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'ladystork' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'ladystork'
    raw_title = page.xpath('.//div[@class="p-title-group"]/h2/text()').extract()[0]
    product['title'] = html_text_normalize(raw_title)
    # The first two breadcrumb links are dropped.
    product['breadcrumb'] = page.xpath('.//ol/li/a/text()').extract()[2:]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@id="ctl00_HTMLContent_pnlDesc"]/div/p/text()').extract())
    product['code'] = None
    raw_price = page.xpath('.//strong[@class="p-price"]/text()').extract()[0]
    product['price'] = price_normalize(raw_price)
    # Only size entries whose class is not "disabled" are kept.
    product['sizes'] = page.xpath(
        './/ul[@class="p-size-list"]/li[@class!="disabled"]/a/text()').extract()
    product['other'] = None
    product['image_urls'] = page.xpath(
        './/div[@class="slick p-thumbs-photo"]/div/img/@src').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'prune' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Wait for sizes and color data to load.
    sizes_path = './/div[@role="option"]/text()'
    self.is_visible(sizes_path, timeout=2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'prune'
    # The breadcrumb is the first space-separated word of the page <title>.
    page_title = page.xpath('.//title/text()').extract()[0]
    product['breadcrumb'] = page_title.split(' ', 1)[0]
    product['title'] = page.xpath(
        './/div[@class="page-title-wrapper product"]/h1/span/text()').extract()[0]
    product['sizes'] = page.xpath(sizes_path).extract()
    product['color'] = page.xpath(
        './/div[@class="swatch-option color selected"]/@aria-label').extract()
    description_nodes = page.xpath(
        './/div[@class="product attribute description"]/div/ul').extract()
    product['description'] = html_text_normalize(description_nodes)
    product['code'] = page.xpath('.//div[@itemprop="sku"]/text()').extract()[0]
    product['price'] = price_normalize(
        page.xpath('.//span[@class="price"]/text()').extract()[0])
    product['other'] = None
    product['image_urls'] = page.xpath(
        './/img[@class="img-responsive"]/@src').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'justaosadia' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'justaosadia'
    product['breadcrumb'] = []
    product['title'] = page.xpath('.//span[@class="name"]/text()').extract()[0]

    # Description = short blurb followed by the detail bullet list.
    blurb = html_text_normalize(
        page.xpath('.//span[@class="desc"]//text()').extract())
    details = html_text_normalize(
        page.xpath('.//ul[@class="detail-list"]/li//text()').extract())
    product['description'] = blurb + ' ' + details

    product['code'] = page.xpath('.//span[@class="code"]/text()').extract()[0]
    raw_price = page.xpath(
        './/span[@class="price"]/span[@itemprop="price"]/@content').extract()[0]
    product['price'] = price_normalize(raw_price)
    product['sizes'] = page.xpath(
        './/ul[@class="sizes-list"]/li/@title').extract()
    product['image_urls'] = page.xpath(
        './/ul[@class="thumbs"]/li/a/@href').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'dos61' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'dos61'
    product['breadcrumb'] = []
    product['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@class="content"]//text()').extract())
    product['code'] = ''

    amounts = page.xpath(
        './/p[@class="price"]//span[@class="woocommerce-Price-amount amount"]/text()'
    ).extract()
    # When several amounts are listed the last one is used; with a single
    # amount the last one is also the first.
    product['price'] = price_normalize(amounts[-1])

    product['sizes'] = page.xpath(
        './/div[@data-attribute="pa_talle"]/span[contains(@class, "ivpa_instock")]/@data-term'
    ).extract()
    product['image_urls'] = page.xpath(
        './/a[@data-slide-index]/img/@src').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'batistella' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(1)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'batistella'
    product['breadcrumb'] = []
    product['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]

    # Description text = intro paragraphs followed by the right-hand table.
    paragraphs = page.xpath(
        './/div[@class="column push-1-16 col-10-12"]/p/text()').extract()
    table_text = page.xpath('.//table[@class="table-right"]//text()').extract()
    product['description'] = html_text_normalize(paragraphs + table_text)

    product['code'] = page.xpath('.//span[@itemprop="sku"]/text()').extract()[0]
    product['price'] = price_normalize(
        page.xpath('.//span[@itemprop="price"]/text()').extract()[0])
    # Options whose value is 0 are excluded.
    size_labels = page.xpath(
        './/select[@class="form-control attribute_select"]/option[@value!=0]/text()'
    ).extract()
    product['sizes'] = sizes_normalize(size_labels)
    product['image_urls'] = page.xpath('.//img[@data-src]/@data-src').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'margiefranzini' product page and yield it as an ``Item``.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    print("------------- New Item ----------------")
    self.browser.get(response.url)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'margiefranzini'
    product['breadcrumb'] = []

    # Strip the brand name out of the title.
    raw_title = page.xpath('.//h1[@class="title border"]/text()').extract()[0]
    without_shoes = raw_title.replace(' Margie Franzini Shoes ', ' ')
    product['title'] = without_shoes.replace(' Margie Franzini ', ' ')

    product['description'] = html_text_normalize(
        page.xpath('.//article[@id="tabDescription"]/p/text()').extract())
    product['code'] = ''

    # Use the price from the promotion block when there is one, otherwise
    # the first regular price node.
    promo_prices = page.xpath(
        './/dl[@class="priceInfo clearfix promotionPrice"]//span[@class="ch-price price"]/text()'
    ).extract()
    if promo_prices:
        raw_price = promo_prices[0]
    else:
        raw_price = page.xpath(
            './/span[@class="ch-price price"]/text()').extract()[0]
    product['price'] = price_normalize(raw_price)

    # Size menu entries; when the menu yields nothing, fall back to the
    # span that carries the "Talle" label.
    size_labels = page.xpath(
        './/menu/li/span[not(contains(text(),"Talle"))]/text()').extract()
    if not size_labels:
        size_labels = page.xpath(
            './/span[@data-idx="1" and contains(text(),"Talle")]/text()'
        ).extract()
    product['sizes'] = sizes_normalize(size_labels)

    # The first two characters of every image URL are dropped
    # (presumably a leading "//" -- TODO confirm).
    image_sources = page.xpath('.//li[@role="listitem"]/img/@src').extract()
    product['image_urls'] = [src[2:] for src in image_sources]
    yield product
def parse_item(self, response):
    """Scrape a 'lazaro' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'lazaro'
    product['breadcrumb'] = []
    product['title'] = page.xpath(
        './/div[@class="product-main-info text-center"]//h1/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@id="collapseOne"]/div/text()').extract())
    # The SKU text carries a 'SKU# ' prefix that is stripped.
    sku_text = page.xpath('.//div[@class="sku"]/text()').extract()[0]
    product['code'] = sku_text.replace('SKU# ', '')
    product['price'] = price_normalize(
        page.xpath('.//span[@class="price"]/text()').extract()[0])
    # Labels whose class contains "no-stock" are excluded.
    product['sizes'] = page.xpath(
        './/div[@class="amconf-images-container switcher-field"]//label[not(contains(@class,"no-stock"))]/text()'
    ).extract()
    product['image_urls'] = page.xpath(
        './/div[@id="gallery_01"]//li/a/@data-image').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'johnlcook' product page and yield it as an ``Item``.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record. ``sizes`` holds the titles of
        the swatches whose class is not exactly ``'swatch disabledSwatch'``.
    """
    print("------------- New Item ----------------")
    self.browser.get(response.url)
    sel = Selector(text=self.browser.page_source)

    item = Item()
    item['created_at'] = datetime.now()
    item['url'] = response.url
    item['brand'] = 'johnlcook'
    item['breadcrumb'] = None
    item['title'] = sel.xpath('.//h2[@class="name"]/text()').extract()[0]
    # The last direct text node of the description container is used.
    item['description'] = sel.xpath(
        './/div[@id="descripcion"]/child::text()').extract()[-1]
    # The first 8 characters of the paragraph after the title are dropped
    # (presumably a fixed label before the code -- TODO confirm).
    item['code'] = sel.xpath(
        './/h2[@class="name"]/following-sibling::p/text()').extract()[0][8:]
    item['price'] = sel.xpath('.//span[@class="price"]/text()').extract()[0]

    # Read @title and @class from the SAME node. Extracting the two
    # attributes as independent lists and pairing them by index misaligns
    # (or raises IndexError) as soon as one div lacks either attribute.
    available_sizes = []
    for swatch in sel.xpath('.//ul[@id="ul-attribute257"]//li//div'):
        titles = swatch.xpath('@title').extract()
        if not titles:
            # A div without a title is not a size swatch.
            continue
        css_class = ''.join(swatch.xpath('@class').extract())
        if str(css_class) != 'swatch disabledSwatch':
            available_sizes.append(str(titles[0]))
    item['sizes'] = available_sizes

    item['other'] = None
    item['image_urls'] = sel.xpath(
        './/li[contains(@class,"moreview")]/child::node()/child::node()/@src'
    ).extract()
    yield item
def parse_item(self, response):
    """Scrape a 'mishka' product page and yield it as an ``Item``.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(5)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'mishka'
    product['breadcrumb'] = []
    product['title'] = html_text_normalize(
        page.xpath('.//span[@itemprop="name"]/text()').extract())
    product['description'] = html_text_normalize(
        page.xpath('.//div[@class="product attribute description"]/div/text()').extract())
    product['code'] = page.xpath('.//div[@itemprop="sku"]/text()').extract()[0]
    raw_price = page.xpath(
        './/span[contains(@id,"product-price")]/span/text()').extract()[0]
    product['price'] = price_normalize(raw_price)
    size_labels = page.xpath(
        './/div[@class="swatch-attribute size"]/div[@class="swatch-attribute-options clearfix"]/div/text()'
    ).extract()
    product['sizes'] = sizes_normalize(size_labels)

    # Image URLs are synthesised: take the first non-thumbnail catalogue
    # image, drop its last five characters, and append "<n>.jpg" for n in
    # 0 .. (number of thumbnails - 1).
    main_image = page.xpath(
        './/img[contains(@src, "https://www.mishka.com.ar/media/catalog/product/cache/") and not(contains(@src, "thumb"))]/@src'
    ).extract()[0]
    url_prefix = main_image[:-5]
    thumbnail_count = len(
        page.xpath('.//img[contains(@src, "thumb")]/@src').extract())
    product['image_urls'] = [
        url_prefix + str(index) + '.jpg' for index in range(thumbnail_count)
    ]
    yield product
def parse_item(self, response):
    """Scrape a 'sofimartire' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.get_with_short_wait(10, response.url)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'sofimartire'
    name = page.xpath(
        './/div[@class="product-main-info"]//h1/text()').extract()[0]
    # The breadcrumb is the first space-separated word of the title.
    product['breadcrumb'] = [name.split(' ', 1)[0]]
    product['title'] = name
    product['description'] = html_text_normalize(
        page.xpath('.//div[@id="collapseOne"]/div/text()').extract())
    product['code'] = page.xpath('.//div[@class="sku"]/text()').extract()[0]
    raw_price = page.xpath(
        './/div[@class="product-main-info"]//span[@class="price"]/text()'
    ).extract()[0]
    product['price'] = price_normalize(raw_price)
    product['sizes'] = page.xpath(
        './/label[@class="amconf-color-container amconf-noimage-div"]/text()'
    ).extract()
    product['other'] = None
    product['image_urls'] = page.xpath(
        './/a[contains(@data-image,"product/cache")]/@data-zoom-image'
    ).extract()
    yield product
def parse_item(self, response):
    """Scrape a 'cestfini' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'cestfini'
    product['breadcrumb'] = []
    product['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@class="descripcion_cestfini"]//text()').extract())
    product['code'] = ''
    raw_price = page.xpath(
        './/div[@class="span4 force100ipad"]//span[@id="price_display"]/text()'
    ).extract()[0]
    product['price'] = price_normalize(raw_price)

    # Variants without a style attribute; duplicates are collapsed via a set.
    size_labels = page.xpath(
        './/div[@data-variant="Talle"]//span[@class="custom-variants" and not(@style)]/text()'
    ).extract()
    product['sizes'] = list(set(size_labels))

    # The first two characters of every link are dropped
    # (presumably a leading "//" -- TODO confirm).
    gallery_links = page.xpath('.//a[@class="cloud-zoom-gallery"]/@href').extract()
    product['image_urls'] = [link[2:] for link in gallery_links]
    yield product
def parse_item(self, response):
    """Scrape a 'converse' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.
    After the item has been yielded (i.e. once the generator is resumed)
    the URL is inserted into ``self.links``.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record. No description, price or sizes
        are scraped here, so those fields are stored as ``None``.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    sel = Selector(text=self.browser.page_source)

    item = Item()
    item['created_at'] = datetime.now()
    item['url'] = response.url
    item['brand'] = 'converse'
    item['breadcrumb'] = ['calzado', 'zapatilla', 'zapatilla urbana']
    item['title'] = sel.xpath(
        './/h1[@class="entry-title"]/text()').extract()[0]
    item['description'] = None
    item['code'] = sel.xpath(
        './/p[contains(text(),"SKU")]/text()').extract()[0]
    # Previously a dead local (`price = None`) left the field unset; store
    # the placeholder on the item like the other None-valued fields.
    item['price'] = None
    item['sizes'] = None
    item['other'] = None
    item['image_urls'] = sel.xpath(
        './/div[@class="producto-image"]/img/@src').extract()
    yield item

    # NOTE(review): if self.links is a pymongo collection, `insert` is the
    # legacy spelling of `insert_one` -- confirm the installed driver version.
    self.links.insert({"_id": response.url})
def parse_item(self, response):
    """Scrape a 'blaque' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record. ``description`` is only set
        when the page has description text.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'blaque'
    product['breadcrumb'] = ''
    product['title'] = page.xpath(
        './/h2[@class="tituloproducto"]/text()').extract()[0]

    description_parts = page.xpath('.//p[@class="descri"]/text()').extract()
    if description_parts:
        product['description'] = html_text_normalize(description_parts)

    product['code'] = page.xpath('.//span[@class="numart"]/text()').extract()[0]
    raw_price = page.xpath(
        './/div[@class="descprod"]//div[@class="price-box"]//span[@itemprop="price"]/@content'
    ).extract()[0]
    product['price'] = price_normalize(raw_price)
    # Swatches whose class contains "disabledSwatch" are excluded.
    product['sizes'] = page.xpath(
        './/div[@class="swatchesContainer"]//li/div[not(contains(@class, "disabledSwatch"))]/text()'
    ).extract()
    product['image_urls'] = page.xpath(
        './/ul[@id="ul-moreviews"]//a[@class="cloud-zoom-gallery"]/@href'
    ).extract()
    yield product
def parse_item(self, response):
    """Scrape a 'febo' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'febo'
    product['breadcrumb'] = page.xpath(
        './/a[contains(@href,"javascript:Form")]/text()').extract()
    # The same article text is used for both the title and the code.
    article_text = page.xpath('.//articulo_det/text()').extract()[0]
    product['title'] = article_text
    product['description'] = html_text_normalize(
        page.xpath('.//descripcion_det/p/text()').extract())
    product['code'] = article_text
    raw_price = page.xpath(
        './/precio_det[@id="preciohtml"]/text()').extract()[0]
    product['price'] = price_normalize(raw_price)
    # Only size boxes that show the "img/btn_S.jpg" image are kept.
    product['sizes'] = page.xpath(
        './/div[@class="talles" and img/@src="img/btn_S.jpg"]/span/text()'
    ).extract()
    product['other'] = None
    # Image sources are prefixed with the site root.
    relative_sources = page.xpath('.//foto_principal/img/@src').extract()
    product['image_urls'] = [
        'https://zapateriafebo.com/' + src for src in relative_sources
    ]
    yield product
def parse_item(self, response):
    """Scrape a 'viamo' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'viamo'
    product['breadcrumb'] = page.xpath(
        './/ul[@class="breadcrumb"]/li//a/text()').extract()
    product['title'] = page.xpath(
        './/div[contains(@class,"prodname")]/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@class="productDescription"]/text()').extract())
    product['code'] = None
    raw_price = page.xpath(
        './/strong[@class="skuBestPrice"]/text()').extract()[0]
    product['price'] = price_normalize(raw_price)
    # Labels whose class contains "unavailable" are excluded.
    product['sizes'] = page.xpath(
        './/label[contains(@class,"dimension-Talle") and not(contains(@class,"unavailable"))]/text()'
    ).extract()
    product['other'] = None
    product['image_urls'] = page.xpath(
        './/a[contains(@title,"Zoom")]/@zoom').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'lucerna' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'lucerna'
    name = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
    product['title'] = name
    # The breadcrumb is the first space-separated word of the title.
    product['breadcrumb'] = [name.split(' ', 1)[0]]
    product['description'] = html_text_normalize(
        page.xpath(
            './/div[@class="description user-content clear"]/p/text()'
        ).extract())
    product['code'] = None
    product['price'] = price_normalize(
        page.xpath('.//span[@id="price_display"]/text()').extract()[0])
    # At most the first five size options are kept.
    size_options = page.xpath(
        './/a[contains(@class,"insta-variations Talle") and span/@class="custom-variants"]/@data-option'
    ).extract()
    product['sizes'] = size_options[0:5]
    product['other'] = None
    # The first two characters of every link are dropped
    # (presumably a leading "//" -- TODO confirm).
    zoom_links = page.xpath(
        './/a[contains(@class,"cloud-zoom") and not(contains(@rel,"position"))]/@href'
    ).extract()
    product['image_urls'] = [link[2:] for link in zoom_links]
    yield product
def parse_item(self, response):
    """Scrape a 'benditopie' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'benditopie'
    product['breadcrumb'] = []
    product['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@itemprop="description"]//span/text()').extract())
    product['code'] = ''
    product['price'] = price_normalize(
        page.xpath('.//span[@itemprop="price"]/text()').extract()[0])

    # Options whose text contains "gotado" are excluded; each remaining
    # label is stripped and cut to its first two characters.
    option_texts = page.xpath(
        './/select[@id="ProductSelect-product-template"]/option[not(contains(text(),"gotado"))]/text()'
    ).extract()
    product['sizes'] = [text.strip()[:2] for text in option_texts]

    # The first two characters of every link are dropped
    # (presumably a leading "//" -- TODO confirm).
    thumb_links = page.xpath(
        './/ul[@id="ProductThumbs-product-template"]/li/a/@href').extract()
    product['image_urls'] = [link[2:] for link in thumb_links]
    yield product
def parse_item(self, response):
    """Scrape a 'lucianomarra' product page and yield it as an ``Item``.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    print("------------- New Item ----------------")
    self.browser.get(response.url)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'lucianomarra'
    product['breadcrumb'] = []
    product['title'] = html_text_normalize(
        page.xpath('.//h1[@itemprop="name"]/span/text()').extract())
    product['description'] = html_text_normalize(
        page.xpath('.//div[@id="tab-description"]//text()').extract())
    product['code'] = ''

    # Prefer the price inside <ins>; otherwise use the plain price span.
    ins_prices = page.xpath(
        './/div[@class="product-price"]/p/ins/span/text()').extract()
    if ins_prices:
        raw_price = ins_prices[0]
    else:
        raw_price = page.xpath(
            './/div[@class="product-price"]/p/span/text()').extract()[0]
    product['price'] = price_normalize(raw_price)

    size_labels = page.xpath(
        './/div[@class="select_option_label select_option"]/span/text()').extract()
    product['sizes'] = sizes_normalize(size_labels)

    # Main gallery links, falling back to the carousel when there are none.
    image_links = page.xpath(
        './/div[@class="images"]//a[@itemprop="image"]/@href').extract()
    if not image_links:
        image_links = page.xpath(
            './/div[@class="caroufredsel_wrapper"]//li/a/@href').extract()
    product['image_urls'] = image_links
    yield product
def parse_item(self, response):
    """Scrape a product page whose brand is read from the page itself.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record. ``sizes`` is whatever
        ``self.parseSize`` returns for the raw ``data-simple`` attributes.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = page.xpath(
        './/div[@class="prd-details"]//h2[@itemprop="brand"]/text()'
    ).extract()[0]
    product['breadcrumb'] = []  # TODO
    product['title'] = page.xpath(
        './/div[@class="prd-details"]//h1[@class="prd-title"]/text()'
    ).extract()[0]
    product['description'] = html_text_normalize(
        page.xpath(
            './/div[@id="productDetails"]//div[contains(@class,"prd-information")]/text()'
        ).extract())
    product['code'] = page.xpath(
        './/div[@id="detailSku"]/@data-sku').extract()[0]
    product['price'] = price_normalize(
        page.xpath('.//span[@id="price_box"]/text()').extract()[0])
    raw_size_data = page.xpath(
        './/div[@class="prd-details"]//ul[contains(@class,"shoe_size")]/li/@data-simple'
    ).extract()
    product['sizes'] = self.parseSize(raw_size_data)
    product['image_urls'] = page.xpath(
        './/ul[@id="productMoreImagesList"]//li/@data-image-product'
    ).extract()
    yield product
def parse_item(self, response):
    """Scrape a 'honkytonk' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'honkytonk'
    product['breadcrumb'] = page.xpath(
        './/a[@class="breadcrumb-crumb"]/text()').extract()
    product['title'] = page.xpath(
        './/span[contains(@class,"product-name")]/text()').extract()[0]
    product['description'] = ''
    product['code'] = ''
    raw_price = page.xpath(
        './/span[@class="price product-price js-price-display"]/@content'
    ).extract()[0]
    product['price'] = price_normalize(raw_price)

    # Sizes come from the <select> options; when there are none, fall back
    # to the custom size links. Duplicates are collapsed via a set.
    size_labels = page.xpath(
        './/div[contains(./label/text(),"talle")]/select/option/text()'
    ).extract()
    if not size_labels:
        size_labels = page.xpath(
            './/a[contains(@class,"custom Size")]/span/@data-name'
        ).extract()
    product['sizes'] = list(set(size_labels))

    # Thumbnail links, falling back to the zoom link. The first two
    # characters of every link are dropped (presumably a leading "//" --
    # TODO confirm).
    thumb_links = page.xpath(
        './/div[@class="jTscroller scroller-thumbs"]/a/@href').extract()
    image_urls = [link[2:] for link in thumb_links]
    if not image_urls:
        zoom_links = page.xpath('.//a[@id="zoom"]/@href').extract()
        image_urls = [link[2:] for link in zoom_links]
    product['image_urls'] = image_urls
    yield product
def parse_item(self, response):
    """Scrape an 'xl' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.
    The price is not scraped from the page; it is read from
    ``response.meta['price']``.

    Args:
        response: crawler response; ``response.url`` and
            ``response.meta['price']`` are read.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    sel = Selector(text=self.browser.page_source)

    item = Item()
    item['created_at'] = datetime.now()
    item['url'] = response.url
    item['brand'] = 'xl'
    item['breadcrumb'] = sel.xpath(
        './/li[@class="last" and @typeof="v:Breadcrumb"]/a/text()'
    ).extract()
    item['title'] = sel.xpath(
        './/div[contains(@class, "fn productName")]/text()').extract()[0]

    raw_description = sel.xpath(
        './/div[contains(@class, "productDescription")]/text()'
    ).extract()
    # Split the description into the product code line(s) and the rest.
    # A new list is built instead of deleting from the list being iterated,
    # which made the loop skip the element following each deleted one.
    code = ''
    description = []
    for line in raw_description:
        if 'Código:' in line:
            # The code starts 8 characters after the start of the marker.
            code = line[line.index('Código:') + 8:]
        else:
            description.append(line)
    item['description'] = html_text_normalize(description)
    item['code'] = code

    item['price'] = price_normalize(response.meta['price'])
    item['sizes'] = sel.xpath(
        './/div[@class="talles isTalle"]/span[@class="stock"]/text()'
    ).extract()
    # Image sources are prefixed with the site root.
    img_urls = sel.xpath('.//div[@class="thumbs"]/img/@src').extract()
    item['image_urls'] = [
        'https://www.xlshop.com.ar/' + url for url in img_urls
    ]
    yield item
def parse_item(self, response):
    """Scrape a 'sarkany' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record. ``price`` is 0 when the page
        shows no best-price node.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'sarkany'

    # The category is taken from an inline script that looks like
    #   vtxctx = {skus:"23",.....,categoryName:"Fiesta",....}
    # Skip 14 characters from the start of 'categoryName' to land on the
    # value, then read up to the next double quote.
    script_text = page.xpath(
        '//script[contains(.,"categoryName")]/text()').extract()[0]
    value_start = script_text.find('categoryName') + 14
    tail = script_text[value_start:]
    value_end = tail.find('"')
    product['breadcrumb'] = [tail[:value_end]]

    product['title'] = page.xpath(
        './/div[contains(@class,"prodname")]/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@class="productDescription"]/text()').extract()[0])
    product['code'] = None

    best_prices = page.xpath(
        './/strong[@class="skuBestPrice"]/text()').extract()
    product['price'] = price_normalize(best_prices[0]) if best_prices else 0

    # Labels whose class contains "unavailable" are excluded.
    product['sizes'] = page.xpath(
        './/label[contains(@class,"Talle") and not(contains(@class,"unavailable"))]/text()'
    ).extract()
    product['other'] = None
    product['image_urls'] = page.xpath(
        './/a[@id="botaoZoom"]/@rel').extract()
    yield product
def parse_item(self, response):
    """Scrape an 'ancayco' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.
    Available sizes are found by clicking every size/color combination in
    the browser and checking whether the buy button is hidden.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record. ``price`` is 0 when the page
        shows no total.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'ancayco'
    product['breadcrumb'] = []
    product['title'] = page.xpath(
        './/div[@class="name uppercase bold"]/text()').extract()[0]

    # Text fragments mentioning PRODUCTO or MERCADO are blanked out.
    fragments = page.xpath('.//div[@class="lfill top-1"]//text()').extract()
    cleaned = [
        '' if ('PRODUCTO' in fragment or 'MERCADO' in fragment) else fragment
        for fragment in fragments
    ]
    product['description'] = html_text_normalize(cleaned)

    code_text = page.xpath('.//div[@class="lfill"]/text()').extract()[0]
    product['code'] = code_text.replace('Código ','')

    totals = page.xpath(
        './/span[@class="_totalContainer left-1"]//text()').extract()
    product['price'] = price_normalize(totals[0]) if totals else 0

    available = []
    for size_el in self.browser.find_elements_by_xpath(self.size_div_path):
        for color_el in self.browser.find_elements_by_xpath(self.color_div_path):
            self.click_element(size_el)
            self.click_element(color_el)
            time.sleep(1)
            size_label = size_el.text
            # Re-parse the page after the clicks; `page` is deliberately
            # rebound, so the image lookup below uses the latest snapshot.
            page = Selector(text=self.browser.page_source)
            button_style = page.xpath(self.buy_button_path).extract()[0]
            if 'display: none;' not in button_style:
                available.append(size_label)
    # Duplicates (same size under several colors) are collapsed via a set.
    product['sizes'] = list(set(available))

    product['image_urls'] = page.xpath(
        './/div[@class="thumbnail"]/a/@href').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'sibylvane' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'sibylvane'
    product['breadcrumb'] = []
    product['title'] = html_text_normalize(
        page.xpath('.//h3[@class="light-blue"]/text()').extract())
    product['description'] = html_text_normalize(
        page.xpath('.//div[@id="ctl00_HTMLContent_pnlDesc"]/p/text()').extract())

    # Strip the "ref:" label and newlines from the reference text.
    ref_text = page.xpath('.//div[@class="ref"]/text()').extract()[0]
    product['code'] = ref_text.replace('ref:', '').replace('\n','')

    # The price is the SECOND text node of the price panel.
    price_nodes = page.xpath(
        './/div[@id="ctl00_HTMLContent_pnlPrice"]/text()').extract()
    product['price'] = price_normalize(price_nodes[1])

    # The "-1" placeholder option and disabled options are excluded.
    product['sizes'] = page.xpath(
        './/select[@id="ddlSizesPicker"]/option[not(@value="-1") and not(@disabled)]/text()'
    ).extract()
    product['image_urls'] = page.xpath(
        './/img[@id="imgProductGallery"]/@data-zoom-image').extract()
    yield product
def parse_item(self, response):
    """Scrape a '47street' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = '47street'
    product['breadcrumb'] = []
    product['title'] = page.xpath('.//span[@class="base"]/text()').extract()[0]
    product['description'] = html_text_normalize(
        page.xpath('.//div[@itemprop="description"]/text()').extract())
    # The SKU text carries a 'SKU# ' prefix that is stripped.
    sku_text = page.xpath('.//div[@itemprop="sku"]/text()').extract()[0]
    product['code'] = sku_text.replace('SKU# ', '')
    product['price'] = price_normalize(
        page.xpath('.//span[@class="price"]/text()').extract()[0])
    product['sizes'] = page.xpath(
        './/div[@class="swatch-option text"]/text()').extract()
    product['image_urls'] = page.xpath(
        './/div[@class="imagen"]/img/@src').extract()
    yield product
def parse_item(self, response):
    """Scrape a 'natacha' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'natacha'

    # Keep only the inner breadcrumb links (first and last are dropped);
    # with two links or fewer there is nothing to keep.
    crumbs = page.xpath('.//nav/ol/li/a/text()').extract()
    product['breadcrumb'] = crumbs[1:-1] if len(crumbs) > 2 else []

    raw_title = page.xpath('.//h1[@class="title border"]/text()').extract()[0]
    product['title'] = raw_title.replace('Natacha Zapato Mujer', '')

    description = html_text_normalize(
        page.xpath('.//article[@id="tabDescription"]//text()').extract())
    # Cut the description at the start of the generic boilerplate, if any.
    generic_text_start = ' Productos confeccionados'
    cut_at = description.find(generic_text_start)
    if cut_at != -1:
        description = description[:cut_at]
    product['description'] = description
    product['code'] = ''

    raw_price = page.xpath('.//span[@class="ch-price price"]/text()').extract()[0]
    product['price'] = price_normalize(raw_price)

    size_labels = page.xpath(
        './/div[@id="my-variation-1-container"]//menu[contains(@class,"ch-select-content")]/li/span[not(text()="Talle")]/text()'
    ).extract()
    if not size_labels:
        # No size menu: fall back to the single "Talle: NN" label.
        single_label = page.xpath(
            './/span[contains(text(),"Talle")]/text()').extract()[0]
        size_labels = [single_label.replace('Talle: ','')]
    # Out-of-stock sizes appear as e.g.
    # [ "37 - Sin Stock", "39 - Sin Stock", "40 - Sin Stock", "41 - Sin Stock" ]
    # and are filtered out here.
    product['sizes'] = [label for label in size_labels if "Sin Stock" not in label]

    image_sources = page.xpath(
        './/ul[@class="ch-carousel-list"]/li/img/@src').extract()
    if len(image_sources) > 1:
        # Eliminate the size table image (the last carousel entry).
        image_sources = image_sources[:-1]
    product['image_urls'] = image_sources
    yield product
def parse_item(self, response):
    """Scrape a product page whose brand is read from the page and slugified.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(1)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    brand_name = page.xpath(
        './/h6[@class="fb-product-cta__brand fb-stylised-caps"]/text()'
    ).extract()[0]
    product['brand'] = slugify(brand_name)
    product['breadcrumb'] = html_text_normalize(
        page.xpath(
            './/b[@class="fb-masthead__breadcrumb__links"]//span[@itemprop="title"]/text()'
        ).extract())
    product['title'] = page.xpath(
        './/h1[@class="fb-product-cta__title"]/text()').extract()[0]
    # The description is built from the data rows of the specification table.
    product['description'] = html_text_normalize(
        page.xpath(
            './/table[@class="fb-product-information__specification__table"]//tr[contains(@class,"row-data")]//text()'
        ).extract())

    # Strip the label that precedes the product code.
    code_text = page.xpath(
        './/p[@class="fb-product-sets__product-code"]/text()').extract()[0]
    product['code'] = code_text.replace('Código del producto:', '')

    # The price paragraph containing "Contado" is used, label removed.
    price_text = page.xpath(
        './/p[@class="fb-price" and contains(text(), "Contado")]/text()'
    ).extract()[0]
    product['price'] = price_normalize(price_text.replace('Contado', ''))

    # Options with an empty value are excluded.
    size_values = page.xpath(
        './/select[@class="fb-inline-dropdown__native-dropdown fsrVisible"]/option[@value!=""]/@value'
    ).extract()
    product['sizes'] = sizes_normalize(size_values)

    # Gallery entries marked with the "more" icon are skipped. The first two
    # characters of every URL are dropped (presumably a leading "//" --
    # TODO confirm).
    zoom_urls = page.xpath(
        './/span[@class="fb-pp-gallery-list__link js-pp-zoom-link" and not(span/i[@class="icon-productGalleryMore"])]/@data-image-zoom'
    ).extract()
    product['image_urls'] = [zoom_url[2:] for zoom_url in zoom_urls]
    yield product
def parse_item(self, response):
    """Scrape a 'grimoldi' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(10)
    page = Selector(text=self.browser.page_source)

    product = Item()
    product['created_at'] = datetime.now()
    product['url'] = response.url
    product['brand'] = 'grimoldi'
    # Breadcrumb = first word of the page title, e.g. "Abotinadas Berry".
    page_title = page.xpath('.//title/text()').extract()[0]
    product['breadcrumb'] = page_title.split(' ', 1)[0]
    product['title'] = page.xpath('.//span[@id="Nombre"]/text()').extract()[0]

    full_description = html_text_normalize(
        page.xpath('.//div[@class="description"]/p/text()').extract())
    # Remove the generic text that starts at '. Compr'.
    product['description'] = full_description.split('. Compr', 1)[0]

    product['code'] = None
    product['price'] = price_normalize(
        page.xpath('.//label[@id="PrecioSeleccionado"]/text()').extract()[0])
    product['sizes'] = page.xpath(
        './/select[@id="IdMedidaSeleccionada"]/option/text()').extract()
    product['other'] = None
    # The first two characters of every URL are dropped
    # (presumably a leading "//" -- TODO confirm).
    image_links = page.xpath(
        './/div[@class="productImages"]//li/img/@data-image-url').extract()
    product['image_urls'] = [link[2:] for link in image_links]
    yield product
def parse_item(self, response):
    """Scrape a 'brunomanetti' product page and yield it as an ``Item``.

    URLs already present in ``self.links`` are reported as old and skipped.
    Available sizes are found by clicking every size link in the browser
    and checking whether the buy button is enabled afterwards.

    Args:
        response: crawler response; only ``response.url`` is read here.

    Yields:
        Item: the populated product record. ``description`` is always the
        normalized text and ``image_urls`` is always a list, matching the
        other ``parse_item`` callbacks.
    """
    if self.links.find_one({"_id": response.url}) is not None:
        print("-------------- OLD -------------")
        return

    print("------------- New Item ----------------")
    self.browser.get(response.url)
    # Fixed pause before the page source is read back from the browser.
    time.sleep(2)
    sel = Selector(text=self.browser.page_source)

    item = Item()
    item['created_at'] = datetime.now()
    item['url'] = response.url
    item['brand'] = 'brunomanetti'
    item['breadcrumb'] = []
    item['title'] = sel.xpath('.//span[@itemprop="name"]/text()').extract()[0]

    description = [sel.xpath('.//h3/strong/font/i/text()').extract()[0]]
    description += sel.xpath(
        './/p[contains(@style,"color: rgb(51, 51, 51); font-family: sans-serif, Arial, Verdana, ")]//text()'
    ).extract()
    if len(description) > 3:
        # Long descriptions end with two fragments that are dropped.
        description = description[:-2]
    # Normalize in every case: previously short descriptions were stored as
    # a raw list while long ones were stored as normalized text.
    item['description'] = html_text_normalize(description)

    item['code'] = ''
    price = sel.xpath('.//span[@id="price_display"]/text()').extract()[0]
    item['price'] = price_normalize(price)

    sizes = []
    for size_a in self.browser.find_elements_by_xpath(self.size_a_path):
        try:
            size_a.click()
        except Exception:
            # Best-effort click: a size link that cannot be clicked is still
            # checked against the buy button below, as before. Narrowed from
            # a bare `except:` so KeyboardInterrupt/SystemExit propagate.
            pass
        time.sleep(0.2)
        actual_size = size_a.text
        buy_button = self.browser.find_elements_by_xpath(self.buy_button_path)[0]
        if buy_button.is_enabled():
            sizes.append(actual_size)
    item['sizes'] = sizes

    # Only the first zoom link is used; its first two characters are dropped
    # (presumably a leading "//" -- TODO confirm). It is wrapped in a list:
    # a bare string here would be iterated character by character by any
    # consumer that expects a list of URLs.
    first_image = sel.xpath('.//a[@class="cloud-zoom"]/@href').extract()[0]
    item['image_urls'] = [first_image[2:]]
    yield item