def parse_product(self, response):
    """Build an ``AliceItem`` from a product page of the "linio" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem`` with url, title, picture, price, brand,
        store, id_store and tag1..tag5 filled in.

    Raises:
        IndexError: if the title, picture or price selector matches nothing.
        ValueError: if the matched price text is not an integer once the
            dots are removed.
    """
    item = AliceItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="title-product-detail"]/h1/text()').extract()[0]
    item['picture'] = response.xpath(
        '/html/head/meta[14]/@content').extract()[0]
    # The price text uses "." as a grouping separator; strip it before int().
    price_text = response.xpath(
        '//*[@id="product-special-price"]'
        '/span[@property="gr:hasCurrencyValue"]/text()'
    ).re(r'\d.+\d')[0]
    item['price'] = int(price_text.replace(".", ""))
    # No brand is scraped for this store; the field is left empty.
    item['brand'] = ""
    item['store'] = "linio"
    item['id_store'] = 16

    tags = response.xpath(
        '//*[@id="category-navigation-breadcrumbs"]/li/a/text()').extract()
    # The first breadcrumb entry is skipped (presumably the home link);
    # the next five become tag1..tag5, padded with "" when there are fewer.
    categories = (tags[1:6] + [""] * 5)[:5]
    for position, category in enumerate(categories, 1):
        item['tag%d' % position] = category
    yield item
def parse_products(self, response):
    """Build an ``AliceItem`` from a product page of the "sodimac" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``. ``brand`` is the first non-blank text of the
        ``marca`` div, or ``''`` when there is none.

    Raises:
        IndexError: if the title, picture or both price selectors match
            nothing.
        ValueError: if the fallback price text is not an integer.
    """
    try:
        item = AliceItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//meta[@property="og:title"]/@content'
        ).extract()[0].replace(' - Sodimac.com', '')
        item['picture'] = response.xpath(
            '//meta[@property="og:image"]/@content').extract()[0]

        # Two page layouts are supported; try the first, then the second.
        try:
            item['price'] = int(
                response.xpath('//div[@class="precio1-1"]/div/text()').re(
                    r'\d\S*')[0].replace('.', ''))
        except (IndexError, ValueError):
            item['price'] = int(
                response.xpath(
                    '//div[@id="skuPrice"]//div[@class="precio1-1"]/text()'
                ).re(r'\d\S*')[0].replace('.', ''))

        # Previously a missing brand stored the (empty) match list itself;
        # store an empty string so the field always holds text.
        brands = response.xpath('//div[@class="marca"]/text()').re(r'\S.+\S')
        item['brand'] = brands[0] if brands else ''
        item['store'] = "sodimac"
        item['id_store'] = 11

        tags = response.xpath('//div[@id="ruta"]//span/text()').re(r'\S.+\S')
        # Skip the first breadcrumb entry; slicing (unlike pop(0)) does not
        # raise when the breadcrumb is empty.
        categories = (tags[1:6] + [""] * 5)[:5]
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # is kept for compatibility but now reports the page URL instead of
        # the undefined name ``arg``.
        print('cannot open %s' % response.url)
def parse_item(self, response):
    """Build an ``AliceItem`` from a product page of the "pcfactory" store.

    Reads ``self.dom`` as the prefix for the picture URL.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``. The title is the title-cased, ASCII-only
        join of the heading fragments; the brand is the first fragment.

    Raises:
        IndexError: if the heading has fewer than two fragments, or the
            picture or both price selectors match nothing.
        ValueError: if the fallback price text is not an integer.
    """
    try:
        item = AliceItem()
        item['url'] = response.url

        title = response.xpath(
            '//tr/td/strong/span[@class="main_titulo_ficha_bold"]/text()'
        ).re(r'\S.+\S')
        # Drop the third heading fragment when present, else the second.
        try:
            title.pop(2)
        except IndexError:
            title.pop(1)
        item['title'] = " ".join(title).title().encode('ascii', 'ignore')

        item['picture'] = self.dom + response.xpath(
            '//div[@id="loadarea"]/a[@href]/img/@src').extract()[0]

        # Prefer the "precio efectivo" figure; fall back to the large price.
        try:
            item['price'] = int(
                response.xpath(
                    '//span[@class="main_precio_efectivo"]/strong/text()'
                ).re(r'\d\S*')[0].replace('.', ''))
        except (IndexError, ValueError):
            item['price'] = int(
                response.xpath('//div[@class="precio_lg"]/text()')[0].re(
                    r'\d\S*')[0].replace('.', ''))

        # The first heading fragment is generally the brand.
        item['brand'] = title[0]
        item['store'] = "pcfactory"
        item['id_store'] = 6

        tags = response.xpath(
            '//a[@class="main_ruta_link"]/text()').extract()
        # Skip the first breadcrumb link; pad tag1..tag5 with "".
        categories = (tags[1:6] + [""] * 5)[:5]
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined name ``arg``.
        print('cannot open %s' % response.url)
def parse_item(self, response):
    """Build an ``AliceItem`` from a product page of the "corona" store.

    The title is the last text node of the breadcrumb (with ``'| '``
    removed); when that is missing the ``h1`` text is used, and ``''`` when
    neither exists.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``.

    Raises:
        IndexError: if the picture or both price selectors match nothing.
        ValueError: if the fallback price text is not an integer.
    """
    try:
        item = AliceItem()
        item['url'] = response.url

        # The text is no longer wrapped in str(): under Python 2 that raised
        # on non-ASCII titles, and the old fallbacks stored the repr of the
        # selector object rather than its text.
        try:
            item['title'] = response.xpath(
                '//*[@id="breadcrum"]/text()')[-1].re(
                    r'\S.+\S')[0].replace('| ', '')
        except IndexError:
            headings = response.xpath(
                '//*[@id="cont_ficha_right"]/h1/text()').extract()
            item['title'] = headings[0] if headings else ''

        item['picture'] = "http://www.corona.cl" + response.xpath(
            '//img[@id="img_zoom"]/@src')[0].extract()

        # Prefer the "precio_internet" span; fall back to any price span.
        # This store uses "," as the grouping separator.
        try:
            item['price'] = int(
                response.xpath(
                    '//div[@id="cont_ficha_precio"]'
                    '/span[@class="precio_internet"]/text()'
                ).re(r'\d\S*')[0].replace(',', ''))
        except (IndexError, ValueError):
            item['price'] = int(
                response.xpath(
                    '//div[@id="cont_ficha_precio"]/span/text()'
                ).re(r'\d\S*')[0].replace(',', ''))

        item['brand'] = ""
        item['store'] = "corona"
        item['id_store'] = 13

        tags = response.xpath('//*[@id="breadcrum"]/a/text()').re(r'\S.+\S')
        # Skip the first breadcrumb link; slicing does not raise when the
        # breadcrumb is empty, unlike the previous pop(0).
        categories = (tags[1:6] + [""] * 5)[:5]
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined name ``arg``.
        print('cannot open %s' % response.url)
def parse_products(self, response):
    """Build an ``AliceItem`` from a product page of the "falabella" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``. Brand and title are title-cased; the title
        is additionally reduced to ASCII.

    Raises:
        IndexError: if the picture, brand, title or both price selectors
            match nothing.
        ValueError: if the fallback price text is not an integer.
    """
    try:
        item = AliceItem()
        item['url'] = response.url
        item['picture'] = response.xpath(
            '//*[@id="contenedor1PP"]/meta/@content').extract()[0]

        # The price is either the second text node of the "precio1" div or
        # the text of its second span, depending on the page layout.
        try:
            item['price'] = int(
                response.xpath(
                    '//*[@id="skuPrice"]/div[@class="precio1"]/text()[2]'
                ).extract()[0].replace(".", ""))
        except (IndexError, ValueError):
            item['price'] = int(
                response.xpath(
                    '//*[@id="skuPrice"]/div[@class="precio1"]/span[2]/text()'
                ).extract()[0].replace(".", ""))

        item['brand'] = response.xpath(
            '//*[@id="productBrand"]/text()').extract()[0].title()
        item['title'] = response.xpath(
            '//*[@id="skuPrice"]/meta[@name="twitter:title"]/@content'
        ).extract()[0].title().encode('ascii', 'ignore')
        item['store'] = "falabella"
        item['id_store'] = 1

        tags = response.xpath('//div[@id="ruta"]//text()').re(r'\S.+\S')
        # Skip the first breadcrumb entry; pad tag1..tag5 with "".
        categories = (tags[1:6] + [""] * 5)[:5]
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined names
        # ``index`` and ``arg``.
        print('error %s' % response.url)
def parse_item(self, response):
    """Build an ``AliceItem`` from a product page of the "casaximena" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem`` with a title-cased title and an absolute
        picture URL.

    Raises:
        IndexError: if the title, picture or price selector matches nothing.
        ValueError: if the price text is not an integer once dots are
            removed.
    """
    try:
        item = AliceItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="main"]/div[@class="container-data-product"]'
            '/h3[@class="title-product"]/text()'
        ).re(r'\S.+\S')[0].title()

        # Turn the relative image path into an absolute URL.
        # NOTE(review): both replacements are applied in sequence, so a src
        # containing both '..' and 'public/img/productos' would get the host
        # inserted twice -- confirm against real pages.
        picture_src = response.xpath('//img[@id]/@src')[0].extract()
        item['picture'] = picture_src.replace(
            '..', 'https://www.casaximena.cl').replace(
            'public/img/productos',
            'https://www.casaximena.cl/new/public/img/productos')

        item['price'] = int(
            response.xpath('//*[@id="main"]/div[2]/h3[2]/span/text()').re(
                r'\d\S*')[0].replace('.', ''))
        item['brand'] = ""
        item['store'] = "casaximena"
        item['id_store'] = 14

        tags = response.xpath('//*[@id="main"]/ul/li/a/text()').re(r'\S.+\S')
        # Skip the first breadcrumb link; slicing does not raise when the
        # breadcrumb is empty, unlike the previous pop(0).
        categories = (tags[1:6] + [""] * 5)[:5]
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined name ``arg``.
        print('cannot open %s' % response.url)
def parse_item(self, response):
    """Build an ``AliceItem`` from a product page of the "ripley" store.

    tag1..tag3 come from fixed positions (5, 8, 11) in the breadcrumb text
    nodes; tag4 and tag5 are the first two comma-separated entries of the
    ``keyword`` meta tag. Every tag that cannot be found is set to ``""``.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``.

    Raises:
        IndexError: if the title, picture or price selector matches nothing.
        ValueError: if the price text is not an integer once dots are
            removed.
    """
    item = AliceItem()
    item['url'] = response.url
    # The old fallback called .title() on the match list and could only
    # raise AttributeError; a missing title now raises IndexError directly.
    item['title'] = response.xpath(
        '//span[@itemprop="name"]/text()').re(r'(\S.+\S)')[0].title()
    item['picture'] = "http://www.ripley.cl/ripley-chile" + str(
        response.xpath('//img[@id="imagen-mini"]/@src').extract()[0])
    item['price'] = int(
        response.xpath('//p[@class="ofomp"]/text()').re(
            r'\d\S*')[0].replace(".", ""))
    # No brand is scraped for this store; the field is left empty.
    item['brand'] = ""
    item['store'] = "ripley"
    item['id_store'] = 2

    # Query the breadcrumb once instead of once per tag.
    crumbs = response.xpath('//*[@id="breadcrumb"]//text()').extract()
    for field, position in (('tag1', 5), ('tag2', 8), ('tag3', 11)):
        item[field] = crumbs[position].title() if position < len(crumbs) else ""

    keyword_meta = response.xpath(
        '//meta[@name="keyword"]/@content').extract()
    keywords = keyword_meta[0].split(',') if keyword_meta else []
    item['tag4'] = keywords[0].title() if keywords else ""
    item['tag5'] = keywords[1].title() if len(keywords) > 1 else ""
    yield item
def parse_item(self, response):
    """Build an ``AliceItem`` from a product page of the "Paris" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``. tag1 defaults to "Sin categoría" and
        tag2..tag5 default to "" when the breadcrumb is too short.

    Raises:
        IndexError: if the title, picture or price selector matches nothing.
        ValueError: if the price text is not an integer once dots are
            removed.
    """
    item = AliceItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//meta[@property="og:description"]/@content'
    ).extract()[0].title().encode('ascii', 'ignore')
    item['picture'] = response.xpath(
        '//meta[@property="og:image"]/@content').extract()[0]
    item['price'] = int(
        response.xpath('//div[@class="price offerPrice bold"]/text()').re(
            r'\d\S*')[0].replace(".", ""))
    # No brand is scraped for this store; the field is left empty.
    item['brand'] = ""
    item['store'] = "Paris"
    item['id_store'] = 3

    tags = response.xpath(
        '//*[@id="WC_BreadCrumbTrailDisplay_div_1"]//text()').re(r'\w.+')
    # The last breadcrumb entry is dropped (presumably the product itself);
    # slicing does not raise on an empty breadcrumb, unlike pop().
    categories = [tag.title() for tag in tags[:-1][:5]]
    categories += [""] * (5 - len(categories))
    categories[0] = categories[0] or "Sin categoría"
    for position, category in enumerate(categories, 1):
        item['tag%d' % position] = category
    yield item
def parse_product(self, response):
    """Build an ``AliceItem`` from a product page of the "dafiti" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem`` with url, title, picture, price, brand,
        store, id_store and tag1..tag5 filled in.

    Raises:
        IndexError: if the title, picture or price selector matches nothing.
    """
    item = AliceItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '/html/head/meta[@property="og:title"]/@content').extract()[0]
    item['picture'] = response.xpath(
        '/html/head/meta[@property="og:image"]/@content').extract()[0]
    # Only the leading run of digits of the price meta content is used.
    item['price'] = int(
        response.xpath('//meta[@itemprop="price"]/@content').re(r'\d+')[0])
    # No brand is scraped for this store; the field is left empty.
    item['brand'] = ""
    item['store'] = "dafiti"
    item['id_store'] = 17

    tags = response.xpath('//li[@class="prs"]/a/@title').extract()
    # Skip the first breadcrumb entry; pad tag1..tag5 with "".
    categories = (tags[1:6] + [""] * 5)[:5]
    for position, category in enumerate(categories, 1):
        item['tag%d' % position] = category
    yield item
def parse_product(self, response):
    """Build an ``AliceItem`` from a product page of the "zmart" store.

    Every item gets the fixed category "Videojuegos y Consolas" as tag1;
    tag2..tag5 are always empty.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``.

    Raises:
        IndexError: if the title, picture or price selector matches nothing.
        ValueError: if the price text is not an integer once dots are
            removed.
    """
    item = AliceItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="producto"]/h1/a/text()').extract()[0]
    item['picture'] = "http://www.zmart.cl" + response.xpath(
        '//*[@id="imagen_producto"]/img/@src').extract()[0]
    item['price'] = int(
        response.xpath('//div[@id="PriceProduct"]//text()').re(
            r'\d.*\d')[0].replace('.', ''))
    # No brand is scraped for this store; the field is left empty.
    item['brand'] = ""
    item['store'] = "zmart"
    item['id_store'] = 15

    # The previous lookup of /html/head/meta[22] was never used and raised
    # IndexError on pages with fewer meta tags, so it has been removed.
    item['tag1'] = "Videojuegos y Consolas"
    for field in ('tag2', 'tag3', 'tag4', 'tag5'):
        item[field] = ""
    yield item
def parse_product(self, response):
    """Build an ``AliceItem`` from a product page of the "lapolar" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``. ``picture`` is ``""`` when the third head
        meta tag has no content.

    Raises:
        IndexError: if the title or price selector matches nothing.
        ValueError: if the price text is not an integer once dots are
            removed.
    """
    try:
        item = AliceItem()
        item['url'] = response.url

        # The old title and price fallbacks called string methods on the
        # match list and could only raise AttributeError; a missing value
        # now raises IndexError directly.
        item['title'] = response.xpath(
            '//*[@class="titulo1 descrip_jq"]/text()'
        ).extract()[0].encode('ascii', 'ignore')

        # Previously a missing picture stored the empty match list itself.
        pictures = response.xpath('/html/head/meta[3]/@content').extract()
        item['picture'] = pictures[0] if pictures else ""

        item['price'] = int(
            response.xpath('//*[@class="precio precio_jq"]/text()').re(
                r'\d\S*')[0].replace('.', ''))
        item['brand'] = ""
        item['store'] = "lapolar"
        item['id_store'] = 4

        tags = response.xpath(
            '//tr[not(@id)]/td[@valign="top"]/div[@width]/a/text()'
        ).extract()
        # The first five breadcrumb links become tag1..tag5, padded with "".
        categories = (tags[:5] + [""] * 5)[:5]
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined name ``arg``.
        print('cannot open %s' % response.url)
def parse_product(self, response):
    """Build an ``AliceItem`` from a product page of the "easy" store.

    Pages whose ``<title>`` contains "Problemas" are treated as server error
    pages: a message is printed and nothing is yielded.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem`` (it was previously yielded twice). ``title``
        and ``picture`` are ``""`` when their meta tags are missing; tag1
        defaults to "Sin categoría".

    Raises:
        IndexError: if the page has no ``<title>`` or no usable price node.
        ValueError: if the fallback price text is not an integer.
    """
    page_title = response.xpath('//title/text()').extract()[0]
    if "Problemas" in page_title:
        print("Tecnical Problem on Server")
        return

    try:
        item = AliceItem()
        item['url'] = response.url

        # Previously a missing title stored the empty match list, and a
        # missing picture stored the og:title matches instead.
        titles = response.xpath(
            '/html/head/meta[@property="og:title"]/@content').extract()
        item['title'] = titles[0] if titles else ""
        pictures = response.xpath(
            '/html/head/meta[@property="og:image"]/@content').extract()
        item['picture'] = pictures[0] if pictures else ""

        # Use the first number found in any price node; if that is missing
        # or not an integer, use the first number of the second price node.
        price_nodes = response.xpath(
            '//*[starts-with(@class,"txt_info_precio_")]/text()')
        try:
            item['price'] = int(
                price_nodes.re(r'\d\S*')[0].replace('.', ''))
        except (IndexError, ValueError):
            item['price'] = int(
                price_nodes[1].re(r'\d\S*')[0].replace('.', ''))

        item['brand'] = ""
        item['store'] = "easy"
        item['id_store'] = 10

        tags = response.xpath(
            '//div[@class="ruta_ubicacion_lista" and not(@style)]//text()'
        ).re(r'\w.+')
        # Skip the first two breadcrumb entries; slicing does not raise on
        # a short breadcrumb, unlike the previous pop(0) calls.
        categories = [tag.title() for tag in tags[2:7]]
        categories += [""] * (5 - len(categories))
        categories[0] = categories[0] or "Sin categoría"
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined name ``arg``.
        print('cannot open %s' % response.url)
        logging.warning(response.url)
def parse_product(self, response):
    """Build an ``AliceItem`` from a product page of the "hites" store.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``. ``picture`` is ``""`` when the ``og:image``
        meta tag is missing.

    Raises:
        IndexError: if the title, price or brand selector matches nothing.
        ValueError: if the price text is not an integer once dots are
            removed.
    """
    try:
        item = AliceItem()
        item['url'] = response.url

        # The old title and price fallbacks called string methods on the
        # match list and could only raise AttributeError; a missing value
        # now raises IndexError directly.
        item['title'] = response.xpath(
            '//meta[@name="description"]/@content'
        ).extract()[0].title().encode('ascii', 'ignore')

        # The old picture fallback used the malformed XPath '///meta...' and
        # the og:title property; an absent image now yields "".
        pictures = response.xpath(
            '//meta[@property="og:image"]/@content').extract()
        item['picture'] = pictures[0] if pictures else ""

        item['price'] = int(
            response.xpath(
                '//span[@id="our_price_display"]/strong/text()'
            ).re(r'\d\S*')[0].replace('.', ''))
        item['brand'] = response.xpath(
            '//*[@id="pb-left-column"]/h1/text()').extract()[0].title()
        item['store'] = "hites"
        item['id_store'] = 12

        tags = response.xpath(
            '//*[@id="center_column"]/div[@class="breadcrumb"]/a/text()'
        ).extract()
        # Skip the first breadcrumb link; slicing does not raise when the
        # breadcrumb is empty, unlike the previous pop(0).
        categories = (tags[1:6] + [""] * 5)[:5]
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined name ``arg``.
        print('cannot open %s' % response.url)
def parse_item(self, response):
    """Build an ``AliceItem`` from a product page of the "abcdin" store.

    When the page shows a "producto no disponible" notice containing
    "no se encuentra", a message is printed and nothing is yielded.

    Args:
        response: Scrapy response for the product page.

    Yields:
        A single ``AliceItem``. tag1 defaults to "Sin categoría" and
        tag2..tag5 default to "" when the breadcrumb is too short.

    Raises:
        IndexError: if both title selectors, the picture selector or the
            price selector match nothing.
        ValueError: if the price text is not an integer once dots are
            removed.
    """
    notices = response.xpath(
        '//*[@id="producto-no-disponible-texto"]/div/text()').extract()
    if notices and "no se encuentra" in notices[0]:
        print("Product not found")
        return

    try:
        item = AliceItem()
        item['url'] = response.url

        # Prefer the description meta tag; fall back to the catalog link.
        try:
            item['title'] = response.xpath(
                '//html/head/meta[@name="description"]/@content'
            ).extract()[0].title()
        except IndexError:
            item['title'] = response.xpath(
                '//*[@id="catalog_link"]/text()').extract()[0].title()

        item['picture'] = "http://www.abcdin.cl" + response.xpath(
            '//img[@id="productMainImage"]/@src').extract()[0]
        item['price'] = int(
            response.xpath('//td[@class="offerprice"]/text()').re(
                r'\d\S*')[0].replace(".", ""))
        # No brand is scraped for this store; the field is left empty.
        item['brand'] = ""
        item['store'] = "abcdin"
        item['id_store'] = 5

        tags = response.xpath(
            '//div[@class="breadcrumb_links"]//text()').re(r'\w.+')
        # Skip the first breadcrumb entry; slicing does not raise when the
        # breadcrumb is empty, unlike the previous pop(0).
        categories = [tag.title() for tag in tags[1:6]]
        categories += [""] * (5 - len(categories))
        categories[0] = categories[0] or "Sin categoría"
        for position, category in enumerate(categories, 1):
            item['tag%d' % position] = category
        yield item
    except IOError:
        # NOTE(review): nothing above obviously raises IOError; the handler
        # now reports the page URL instead of the undefined name ``arg``.
        print('cannot open %s' % response.url)