def parse(self, response):
    """Crawl Darty: follow pagination, enqueue product detail pages, and
    build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath(
        '//body[@id="darty_liste_produit"]//div[@id="main_pagination_top"]/div['
        + u.x_class('darty_product_list_pages_list') + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './a[text()=" Page suivante"][last()]/@href').extract_first()
        if url_next_page is not None:
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath(
        '//body[@id="darty_liste_produit"]//div[@id="main_products_list"]')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('infos_container')
                            + ']/h2/a/@href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//body[@id="page_product"]')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//ul[@id="dartyCom_fil_ariane"]')
        main_category = x_categories.xpath('./li[2]/a/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './li[position() >= 3 and position() < last()]/a/text()').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        brand = response.xpath(
            '//a[@id="darty_product_brand"]/text()').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name: join all text nodes, drop newlines and collapse spaces.
        name = re.sub(
            ' +', ' ',
            ''.join(response.xpath(
                '//h1[' + u.x_class('product_head') + ']//div['
                + u.x_class('product_name')
                + ']/span//text()').extract()).replace('\n', '').replace(
                    '\r', '').strip())
        # Price
        price, price_old, currency = p.get_darty_prices(response)
        # Image
        src = response.xpath(
            '//div[' + u.x_class('darty_product_picture_main_pic_container')
            + ']/div[1]//img/@src').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        # NOTE(review): the '//meta...' xpaths below are absolute, so they
        # ignore the x_avis context node — kept as-is to preserve behavior.
        x_avis = response.xpath('//div[' + u.x_class('bloc_reviews_resume')
                                + ']')
        rate = x_avis.xpath(
            '//meta[@itemprop="ratingValue"]/@content').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        max_rate = x_avis.xpath('//div[' + u.x_class('bloc_reviews_note')
                                + ']/sub/text()').extract_first()
        if max_rate is not None:
            # BUGFIX: remove non-digits with '' (was ' '): values such as
            # "1 234" used to become "1 234", making int() raise ValueError.
            max_rate = int(re.sub(r'\D', '', max_rate.strip()))
        nb_avis = x_avis.xpath(
            '//meta[@itemprop="ratingCount"]/@content').extract_first()
        if nb_avis is not None:
            nb_avis = int(re.sub(r'\D', '', nb_avis.strip()))
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Fnac: follow pagination, enqueue product detail pages, and
    build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//ul[' + u.x_class('Pager bottom-toolbar')
                                  + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './/a[' + u.x_class('prevnext actionNext')
            + ']/@href').extract_first()
        if url_next_page is not None:
            yield Request(url_next_page, callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//ul[' + u.x_class('articleList') + ']')
    if x_list:
        urls = x_list.xpath('.//p[' + u.x_class('Article-desc')
                            + ']/a/@href').extract()
        for url in urls:
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//div[' + u.x_class('f-productPage') + ']')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//ul[' + u.x_class('f-breadcrumb')
                                      + ']')
        main_category = x_categories.xpath('./li[2]/a/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './li[position() >= 3]/a/text()').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Name
        # BUGFIX: extract_first() may return None; the original called
        # .strip() on it unconditionally and could raise AttributeError.
        name = response.xpath('//h1[' + u.x_class('f-productHeader-Title')
                              + ']/text()').extract_first()
        if name is not None:
            name = name.strip()
        # Price
        price, price_old, currency = p.get_fnac_prices(response)
        # Image
        src = response.xpath('//img[' + u.x_class('f-productVisuals-mainMedia')
                             + ']/@src').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        x_avis = response.xpath('//div[' + u.x_class('f-review-header') + ']')
        rate = x_avis.xpath('.//div[' + u.x_class('f-review-headerRate')
                            + ']/text()').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        max_rate = x_avis.xpath('.//span['
                                + u.x_class('f-review-headerRateTotal')
                                + ']/text()').extract_first()
        if max_rate is not None:
            max_rate = u.string_to_float(max_rate.strip().replace("/", ""))
        nb_avis = response.xpath(
            '//div[' + u.x_class('f-productHeader-review') + ']//span['
            + u.x_class('f-productHeader-reviewLabel')
            + ']/text()').extract_first()
        if nb_avis is not None:
            # Raw regex string avoids the invalid-escape-sequence warning.
            nb_avis = u.string_to_float(re.sub(r"\D", "", nb_avis.strip()))
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = None
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        # Fall back to the bundled default image when the store serves its
        # generic "no image" placeholder.
        if src == self.src_no_image:
            copyfile(
                "data/default.jpg",
                "data/" + self.name + "/img/" + item["image_name"] + ".jpg")
        yield item
def parse(self, response):
    """Crawl Auchan: follow pagination, enqueue product detail pages, and
    build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//nav[' + u.x_class('ui-pagination') + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './/a[' + u.x_class('ui-pagination--next')
            + ']/@href').extract_first()
        if url_next_page is not None:
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//div[' + u.x_class('product-list--container')
                            + ']')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('product-item--wrapper')
                            + ']/a/@href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//div[' + u.x_class('product-detail') + ']')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath(
            '//div[' + u.x_class('ui-breadcrumb--scroller') + ']/nav')
        main_category = x_categories.xpath(
            './span[2]/meta[@itemprop="name"]/@content').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './span[position() >= 3 and position() < last()]'
            '/meta[@itemprop="name"]/@content').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        x_brand_name = response.xpath(
            '//div[' + u.x_class('product-detail--wrapper') + ']')
        brand = x_brand_name.xpath(
            './meta[@itemprop="brand"]/@content').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name
        # BUGFIX: extract_first() may return None; the original chained
        # .replace()/.strip() on it unconditionally and could crash.
        name = x_brand_name.xpath('./h1[' + u.x_class('product-detail--title')
                                  + ']/text()').extract_first()
        if name is not None:
            name = name.replace('\n', '').replace('\r', '').strip()
        # Price
        price, price_old, currency = p.get_auchan_prices(response)
        # Image
        src = response.xpath('//div[' + u.x_class('x-scroller')
                             + ']/label[1]//img/@src').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        x_avis = response.xpath('//div[' + u.x_class('product-detail--rating')
                                + ']')
        rate = x_avis.xpath(
            './/meta[@itemprop="ratingValue"]/@content').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        nb_avis = x_avis.xpath(
            './/meta[@itemprop="reviewCount"]/@content').extract_first()
        if nb_avis is not None:
            nb_avis = int(nb_avis.strip())
        # The maximum rate is the number of star icons rendered in the
        # rating background.
        max_rate = x_avis.xpath(
            './/span[' + u.x_class('ui-rating--background') + ']/i['
            + u.x_class('icon-auchan-82') + ']').extract()
        max_rate = len(max_rate) if max_rate else None
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Cdiscount: enqueue every numbered list page once, enqueue
    product detail pages (capped at 300), and build a ``Product`` item from
    each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages: only on the first response, enqueue every numbered
    # page by rewriting the URL ("...X.html" -> "...X-<n>.html").
    x_pagination = response.xpath('//ul[@id="PaginationForm_ul"]')
    if x_pagination and self.first:
        self.first = False
        nb_page = x_pagination.xpath('./li[last()]/a/text()').extract_first()
        if nb_page is not None:
            for x in range(1, int(nb_page.strip())):
                yield Request(response.url[:-5] + "-" + str(x)
                              + response.url[-5:], callback=self.parse)

    # Yield product pages (hard cap of 300 products per crawl).
    x_list = response.xpath('//ul[@id="lpBloc"]')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('prdtBILDetails')
                            + ']/a/@href').extract()
        for url in urls:
            url = url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            if (open_ssl_hash not in self.already_crawled
                    and self.nb_crawled < 300):
                self.nb_crawled += 1
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//h1[@itemprop="name"]')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//div[@id="bc"]')
        main_category = x_categories.xpath(
            './/li[3]//span/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './/li[position() >= 4 and position() < last()]'
            '//span/text()').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        brand = response.xpath(
            '//table[' + u.x_class('fpDescTb fpDescTbPub')
            + ']//span[@itemprop="brand"]//span[@itemprop="name"]'
            '/text()').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name
        # BUGFIX: the h1 may have no direct text node; extract_first() then
        # returns None and the original .strip() raised AttributeError.
        name = x_product.xpath('./text()').extract_first()
        if name is not None:
            name = re.sub(' +', ' ', name.strip())
        # Price
        price, price_old, currency = p.get_cdiscount_prices(response)
        # Image
        src = response.xpath('//div[' + u.x_class('fpMainImg')
                             + ']/a[@itemprop="image"]/@href').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        # NOTE(review): the '//span...' xpaths below are absolute, so they
        # ignore the x_avis context node — kept as-is to preserve behavior.
        x_avis = response.xpath('//div[' + u.x_class('topMainRating') + ']')
        rate = x_avis.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        nb_avis = x_avis.xpath(
            '//span[@itemprop="ratingCount"]/text()').extract_first()
        if nb_avis is not None:
            nb_avis = int(nb_avis.strip())
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        # Cdiscount rates are always out of 5.
        item["max_rate"] = 5
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Boulanger: follow pagination, enqueue product detail pages,
    and build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//div[' + u.x_class('navigationListe')
                                  + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './/span[' + u.x_class('navPage navPage-right')
            + ']/a/@href').extract_first()
        if url_next_page is not None:
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//div[' + u.x_class('productListe') + ']')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('designations')
                            + ']/h2/a/@href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//h1[@itemprop="name"]')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//div[@id="filAriane"]')
        main_category = x_categories.xpath(
            './/li[2]//a/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './/li[position() >= 3 and position() <= last()]//a/text()'
        ).extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Name: join all text nodes, drop newlines and collapse spaces.
        name = re.sub(
            ' +', ' ',
            ''.join(x_product.xpath('./text()').extract()).replace(
                '\n', '').replace('\r', '').strip())
        # Price
        price, price_old, currency = p.get_boulanger_prices(response)
        # Image: built from the EAN (gtin13) via the scene7 CDN.
        src = response.xpath(
            '//span[@itemprop="gtin13"]/text()').extract_first()
        if src is not None:
            src = ("https://boulanger.scene7.com/is/image/Boulanger/"
                   + src.strip() + "_h_f_l_0")
        # Avis (reviews): the rate is encoded in the element's class
        # attribute digits (e.g. "...45" means 4,5).
        x_avis = response.xpath('//div[' + u.x_class('top') + ']/div['
                                + u.x_class('right') + ']//span['
                                + u.x_class('rating') + ']')
        rate = x_avis.xpath('./@class').extract_first()
        if rate is not None:
            rate = re.sub(r'\D', '', rate.strip())
            # BUGFIX: also treat an empty digit string as "no rating" — the
            # original passed '' straight to u.string_to_float().
            if rate and rate != "0":
                if len(rate) > 1:
                    # Re-insert the decimal separator: "45" -> "4,5".
                    rate = rate[:1] + "," + rate[1:]
                rate = u.string_to_float(rate)
            else:
                rate = None
        nb_avis = x_avis.xpath('./span[' + u.x_class('link')
                               + ']/text()').extract_first()
        if nb_avis is not None:
            # BUGFIX: guard against a text node containing no digits, which
            # would make int('') raise ValueError.
            digits = re.sub(r'\D', '', nb_avis.strip())
            nb_avis = int(digits) if digits else None
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = None
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        # Boulanger rates are always out of 5.
        item["max_rate"] = 5
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Materiel.net: follow pagination, enqueue product detail pages,
    and build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//ul['
                                  + u.x_class('pagination pagination-sm')
                                  + ']')
    if x_pagination:
        # The "next" link is the last <li>; on some pages it is the one
        # before last instead.
        url_next_page = x_pagination.xpath(
            './li[position() = last()]/a/@href').extract_first()
        if url_next_page is None:
            url_next_page = x_pagination.xpath(
                './li[position() = (last() - 1)]/a/@href').extract_first()
        if url_next_page is not None:
            # Consistency fix: strip the href like the sibling spiders do.
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//table[' + u.x_class('ProdList') + ']')
    if x_list:
        urls = x_list.xpath('.//td[' + u.x_class('Photo')
                            + ']/span/@data-href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//div[@id="prod"]')
    if x_product:
        item = Product()
        # Categories (breadcrumb entry 2 is the brand, handled below; the
        # main category is hard-coded for this store).
        x_categories = response.xpath('//nav[@id="breadcrumb"]')
        categories = x_categories.xpath(
            './/li[position() >= 3 and position() < last()]/a/text()'
        ).extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        brand = x_categories.xpath('.//li[2]/a/text()').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name: join all text nodes, drop newlines and collapse spaces.
        name = re.sub(
            ' +', ' ',
            ''.join(response.xpath(
                '//h1[@id="ProdTitle"]//text()').extract()).replace(
                    '\n', '').replace('\r', '').strip())
        # Price
        price, price_old, currency = p.get_materiel_net_prices(response)
        # Image: prefer the slider zoom image, fall back to the main
        # container's zoom image.
        src = response.xpath('//div[' + u.x_class('swiper-wrapper')
                             + ']//a/@data-zoom-image').extract_first()
        if src is None:
            src = response.xpath(
                '//div[@id="container-image"]/@data-zoom-image'
            ).extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        x_avis = response.xpath('//div[' + u.x_class('headerAvisClients')
                                + ']')
        rate = x_avis.xpath('.//span[' + u.x_class('noteUser')
                            + ']/text()').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        max_rate = x_avis.xpath(
            './/span[' + u.x_class('noteUser')
            + ']/following-sibling::span[1]/text()').extract_first()
        if max_rate is not None:
            max_rate = u.string_to_float(max_rate.strip())
        nb_avis = x_avis.xpath(
            './/span[@id="avisCount"]/span/text()').extract_first()
        if nb_avis is not None:
            nb_avis = int(nb_avis.strip())
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = "Informatique"
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item