def parse_item(self, response):
    i = CrawlbotItem()
    i['url'] = response.url
    i['title'] = response.xpath(
        '//header[contains(@class, "content-header")]/h1/text()'
    ).extract()[0].strip()
    # join the article paragraphs into a single description string
    desc = response.xpath('//div[@class="content-text"]//p/text()').extract()
    i['description'] = " ".join(desc)
    return i
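# For reference, a minimal sketch of what CrawlbotItem presumably looks like,
# inferred from the fields populated above and in the detail parsers below.
# This is an assumption, not the project's actual definition (which lives in
# the project's items module); in particular the 'images' field is only
# implied by the use of Scrapy's ImagesPipeline.
#
# import scrapy
#
# class CrawlbotItem(scrapy.Item):
#     url = scrapy.Field()
#     title = scrapy.Field()
#     description = scrapy.Field()
#     image_urls = scrapy.Field()   # consumed by ImagesPipeline
#     images = scrapy.Field()       # populated by ImagesPipeline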
def parse_detail(self, response):
    # gallery links first; fall back to the zoom image if the gallery is empty
    images = response.css('#amasty_gallery a::attr(href)').extract()
    if not images:
        images = response.css('div.product-image img::attr(data-zoom-image)').extract()
    name = response.css('div.product-name h1::text').extract()[0]
    loader = ItemLoader(item=CrawlbotItem(), selector=images)
    loader.add_value('image_urls', images)
    price_tex = response.css('div.price-box span span::text').extract_first()
    if price_tex is not None:
        # drop the trailing currency mark and the thousands separators
        price_tex = price_tex[:-2].replace('.', '')
    sizes = []
    for li_size in response.css('#configurable_swatch_size li'):
        sizes.append(li_size.css('a::attr(title)').extract()[0])
    # set product
    product_item = ProductItem()
    product_item.product['name'] = name
    product_item.product['price_tex'] = price_tex
    product_item.product['manufacturer'] = CRAWLING_SITES[self.start_urls[0]]['brand']
    # map each image URL to the path ImagesPipeline will store it under
    ima_url = loader._values['image_urls']
    ima_url = [CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
               hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
               for i in ima_url]
    product_item.product['image'] = ','.join(ima_url)
    product_item.set_alt_image()
    product_item.product['product_url'] = response.request.url
    product_item.product['categories_url'] = response.meta['root_url']
    product_item.product['category'] = response.meta['root_name']
    # write product to csv file
    if product_item.product['image'].find('images') == 0:
        product_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # set a combination per size
    if sizes:
        combination_item = CombinationItem()
        for element in sizes:
            size = element.split('/')[0].strip()
            combination_item.set_attribute(size)
            # cost = element.split('-')[1][0:-1].replace(',', '.')
            combination_item.set_wholesale_price(price_tex)
            combination_item.set_product_reference()
            combination_item.combination['group'] = 'Size:12'
            # write combination to csv file
            combination_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # save images
    yield loader.load_item()
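# The SHA1 construction above mirrors Scrapy's ImagesPipeline default naming,
# which stores each download under <IMAGES_STORE>/full/<sha1 of the image
# URL>.jpg; predicting that path lets the CSV reference images before the
# pipeline has downloaded them. A minimal sketch of a shared helper
# (pipeline_image_path is a hypothetical name, not part of this project;
# hashlib is already imported by the module):

def pipeline_image_path(image_dir, image_url):
    """Predict where ImagesPipeline will store image_url, e.g. for the CSV."""
    return image_dir + 'full/' + hashlib.sha1(image_url.encode('utf-8')).hexdigest() + '.jpg'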
def parse_detail(self, response):
    # set image_url and call pipeline
    image = response.css('div.swiper-wrapper')[1].css('a::attr(href)').extract()
    image = [response.urljoin(url) for url in image]
    loader = ItemLoader(item=CrawlbotItem(), selector=image)
    loader.add_value('image_urls', image)
    # get value for fields from web; the name is split across two nodes
    name_p1 = response.css('div.col-sm-12 h3::text').extract_first()
    name_p2 = response.css('div.row div.col-sm-12::text').extract()[2].strip()
    name = (name_p1 or '') + name_p2
    price_tex = response.css('div.col-sm-12.price::text').extract_first()
    reduction_from = response.css('div.col-sm-12.price-line-through::text').extract_first()
    description = response.css(
        '#content-wrapper > section > div:nth-child(3) > div > div.col-sm-7 '
        '> div:nth-child(4) > div p::text'
    ).extract_first()
    # set product
    product_item = ProductItem()
    product_item.product["name"] = name
    if description is not None:
        product_item.product["description"] = description
    if price_tex is not None and len(price_tex) > 3:
        # drop the trailing currency mark and the thousands separators
        price_tex = price_tex.strip()[0:-1].replace('.', '')
        product_item.product["price_tex"] = price_tex
    if reduction_from is not None:
        product_item.product['reduction_from'] = reduction_from.strip()[0:-1].replace('.', '')
        product_item.set_reduction_price()
    product_item.product['manufacturer'] = CRAWLING_SITES[self.start_urls[0]]['brand']
    # map each image URL to the path ImagesPipeline will store it under
    ima_url = loader._values['image_urls']
    ima_url = [CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
               hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
               for i in ima_url]
    product_item.product['image'] = ','.join(ima_url)
    product_item.set_alt_image()
    product_item.product['product_url'] = response.request.url
    product_item.product['categories_url'] = response.meta['root_url']
    product_item.product['category'] = response.meta['root_name']
    sizes = response.css('div.col-xs-10.attribut-wrapper')[0].css('label a::text').extract()
    # write product to csv file
    if product_item.product['image'].find('images') == 0:
        product_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # set a combination per size
    if sizes:
        combination_item = CombinationItem()
        for size in sizes:
            combination_item.set_attribute(size)
            combination_item.set_wholesale_price(price_tex)
            combination_item.set_product_reference()
            combination_item.combination['group'] = 'Size:12'
            # write combination to csv file
            combination_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # save images
    yield loader.load_item()
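# Every parser repeats the same per-size combination loop. A hedged sketch of
# that loop as a shared helper, assuming the CombinationItem API used above
# (write_size_combinations is a hypothetical name, not part of this project):

def write_size_combinations(sizes, price, data_file):
    """Write one combination CSV row per size, referencing the product row."""
    combination_item = CombinationItem()
    for size in sizes:
        combination_item.set_attribute(size)
        combination_item.set_wholesale_price(price)
        combination_item.set_product_reference()
        combination_item.combination['group'] = 'Size:12'
        combination_item.write_to_csv(data_file)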
def parse_detail(self, response):
    # set image_url and call pipeline; data-image URLs are protocol-relative
    image = response.css('#surround > div a::attr(data-image)').extract()
    image = ['https:' + i for i in image]
    loader = ItemLoader(item=CrawlbotItem(), selector=image)
    loader.add_value('image_urls', image)
    # get value for fields from web
    name = response.css('div.product-title h1::text').extract_first()
    description_p1 = response.css('#tab_one > div > p::text').extract()
    description_p2 = response.css('#tab_one > div::text').extract()
    description = description_p1 + description_p2
    price_tex = response.css('div.product-price span::text').extract_first()
    reduction_from = response.css('div.product-price del::text').extract_first()
    # each option text carries size, colour and cost in one string
    sizes_colors_cost = response.css('#product-select > option::text').extract()
    # set product
    product_item = ProductItem()
    product_item.product["name"] = name
    product_item.product["description"] = '\n'.join(description).strip()
    product_item.product["price_tex"] = price_tex[0:-1].replace(',', '')
    if reduction_from is not None:
        product_item.product['reduction_from'] = reduction_from[0:-1].replace(',', '')
        product_item.set_reduction_price()
    product_item.product['manufacturer'] = CRAWLING_SITES[self.start_urls[0]]['brand']
    # map each image URL to the path ImagesPipeline will store it under
    ima_url = loader._values['image_urls']
    ima_url = [CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
               hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
               for i in ima_url]
    product_item.product['image'] = ','.join(ima_url)
    product_item.set_alt_image()
    product_item.product['product_url'] = response.request.url
    product_item.product['categories_url'] = response.meta['root_url']
    product_item.product['category'] = response.meta['root_name']
    # write product to csv file
    if product_item.product['image'].find('images') == 0:
        product_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # set a combination per option
    if sizes_colors_cost:
        combination_item = CombinationItem()
        for element in sizes_colors_cost:
            size = element.split('/')[0].strip()
            combination_item.set_attribute(size)
            # a per-option cost is also present in the text but, as in the
            # other parsers, the base price is used for every combination
            # cost = element.split('-')[1][0:-1].replace(',', '.')
            combination_item.set_wholesale_price(price_tex[0:-1].replace(',', ''))
            combination_item.set_product_reference()
            combination_item.combination['group'] = 'Size:12'
            # write combination to csv file
            combination_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # save images
    yield loader.load_item()
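# The combination loop above splits each option string on '/' and '-',
# assuming a '<size> / <colour> - <price>' layout such as 'M / Red - 250,5đ'
# (the example string is illustrative, not taken from the site). The same
# parsing as a standalone, hypothetical helper:

def parse_option_text(option_text):
    """Split '<size> / <colour> - <price>' into (size, cost)."""
    size = option_text.split('/')[0].strip()
    # drop the trailing currency mark and use '.' as the decimal separator
    cost = option_text.split('-')[1][0:-1].replace(',', '.')
    return size, cost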
def parse_detail2(self, response):
    # get set of image urls (protocol-relative) and the product name
    image_urls = response.css(
        '#product_addtocart_form > div.product-img-box > div.more-views '
        '> ul li a img::attr(src)'
    ).extract()
    name = response.css(
        '#product_addtocart_form > div.product-info-right > div.product-shop '
        '> div.product-name > span::text'
    ).extract_first()
    image_urls = ['https:' + i for i in image_urls]
    loader = ItemLoader(item=CrawlbotItem(), selector=image_urls)
    loader.add_value('image_urls', image_urls)
    # get detail
    sizes = response.css('div.input-box ul li a::attr(title)').extract()
    # regular price; fall back to the old/special price pair when discounted
    old_price = response.css(
        'div.price-info div.price-box span.regular-price span.price::text'
    ).extract_first()
    new_price = None
    if old_price is None:
        old_price = response.css(
            '#product_addtocart_form > div.product-info-right > div.product-shop '
            '> div.price-info > div > p.old-price span.price::text'
        ).extract_first()
        new_price = response.css(
            '#product_addtocart_form > div.product-info-right > div.product-shop '
            '> div.price-info > div > p.special-price span::text'
        ).extract_first()
    # set product
    product_item = ProductItem()
    product_item.product["name"] = name.replace(',', '')
    product_item.product["price_tex"] = old_price.replace('đ', '').replace('.', '')
    product_item.product['manufacturer'] = CRAWLING_SITES[self.start_urls[0]]['brand']
    # map each image URL to the path ImagesPipeline will store it under
    ima_url = loader._values['image_urls']
    ima_url = [CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
               hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
               for i in ima_url]
    product_item.product['image'] = ','.join(ima_url)
    product_item.set_alt_image()
    product_item.product['product_url'] = ''.join(response.meta['root_url'])
    product_item.product['categories_url'] = response.request.url
    product_item.product['category'] = ''.join(response.meta['root_name'])
    # if new_price is not None:
    #     product_item.product['reduction_from'] = new_price.replace('đ', '').replace('.', '')
    #     product_item.set_reduction_price()
    # write product to csv file
    if product_item.product['image'].find('images') == 0:
        product_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # set a combination per size
    if sizes:
        combination_item = CombinationItem()
        for size in sizes:
            combination_item.set_attribute(size)
            combination_item.set_wholesale_price(old_price.replace(',', ''))
            combination_item.set_product_reference()
            combination_item.combination['group'] = 'Size:12'
            # write combination to csv file
            combination_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # save images
    yield loader.load_item()
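# Prices here are Vietnamese-formatted strings such as '1.250.000đ' (dots as
# thousands separators, 'đ' as the currency sign; the example value is
# illustrative). A hedged helper for the normalisation done inline above:

def normalize_vnd_price(price_text):
    """Strip the 'đ' sign and the thousands dots from a VND price string."""
    return price_text.replace('đ', '').replace('.', '').strip()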
def parse_detail(self, response):
    # set image_url and call pipeline
    image = response.css(
        'div.more-views div.product-image-thumbs div a img::attr(src)'
    ).extract()
    loader = ItemLoader(item=CrawlbotItem(), selector=image)
    loader.add_value('image_urls', image)
    # get value for fields from web
    name = response.css('div.product-shop div.product-name span::text').extract_first()
    description_p1 = response.css('div.tab-content div.std p::text').extract()
    description_p2 = response.css('div.tab-content div::text').extract()
    descriptions = description_p2[0:2] + description_p1
    description = ','.join(i.strip() for i in descriptions)
    # first match is the pre-discount price, last match the current price
    price = response.css('div.price-info div span.price::text').extract()
    # set product
    product_item = ProductItem()
    product_item.product["name"] = name
    product_item.product["description"] = description.replace(',', '').replace('\t', '').replace('\r', '')
    price_tex = None
    if price:
        # drop the trailing currency mark and the thousands separators
        price_tex = price[-1].strip()[0:-2].replace('.', '')
        product_item.product["price_tex"] = price_tex
        product_item.product['reduction_from'] = price[0].strip()[0:-2].replace('.', '')
        product_item.set_reduction_price()
    product_item.product['manufacturer'] = CRAWLING_SITES[self.start_urls[0]]['brand']
    # map each image URL to the path ImagesPipeline will store it under
    ima_url = loader._values['image_urls']
    ima_url = [CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
               hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
               for i in ima_url]
    product_item.product['image'] = ','.join(ima_url)
    product_item.set_alt_image()
    product_item.product['product_url'] = response.request.url
    product_item.product['categories_url'] = response.meta['root_url']
    product_item.product['category'] = response.meta['root_name']
    sizes = response.css(
        'dd.clearfix.swatch-attr.last ul.configurable-swatch-list.clearfix li a::attr(title)'
    ).extract()
    # write product to csv file
    if product_item.product['image'].find('images') == 0:
        product_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # set a combination per size
    if sizes:
        combination_item = CombinationItem()
        for size in sizes:
            combination_item.set_attribute(size)
            combination_item.set_wholesale_price(price_tex)
            combination_item.set_product_reference()
            combination_item.combination['group'] = 'Size:12'
            # write combination to csv file
            combination_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])
    # save images
    yield loader.load_item()
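# For reference: every parser indexes CRAWLING_SITES by self.start_urls[0],
# so the config appears to be keyed by start URL and to carry at least
# 'brand', 'image_dir' and 'data_file'. A sketch of one entry with placeholder
# values (the real config lives elsewhere in the project; the URL and paths
# below are illustrative assumptions, with 'image_dir' starting with 'images'
# to satisfy the .find('images') == 0 check above):
#
# CRAWLING_SITES = {
#     'https://example-shop.test/': {
#         'brand': 'Example Brand',
#         'image_dir': 'images/example_shop/',
#         'data_file': 'data/example_shop.csv',
#     },
# }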