def _scrape_product_links(self, response):
    """Yield ``(url, SiteProductItem)`` pairs for each product in the
    Waitrose JSON payload.

    Copies the fields listed in ``_PRODUCT_TO_DATA_KEYS`` from the raw
    product JSON, normalises the image URL and product URL, and parses
    the price string into a ``Price`` object.
    """
    data = WaitroseProductsSpider._get_data(response)
    for product_data in data['products']:
        product = SiteProductItem()
        for product_key, data_key in self._PRODUCT_TO_DATA_KEYS.items():
            # 'null' is the sentinel the feed uses for "no value".
            value = product_data.get(data_key, 'null')
            if value != 'null':
                product[product_key] = product_data[data_key]

        # BUG FIX: the default used to be the *string* 'None', which is
        # truthy, so missing images were joined into a bogus URL.
        image_url = product.get('image_url')
        if image_url:
            product['image_url'] = urlparse.urljoin('http://', image_url)

        # This one is not in the mapping since it requires transformation.
        #product['upc'] = int(product_data['productid'])

        if product.get('price'):
            price = product['price']
            # Normalise the pound sign to 'p' so "£1.50" and "50p"
            # style prices go through the same parsing path below.
            price = price.replace('£', 'p')
            price = re.findall(r'(p? *[\d ,.]+ *p?) *', price)
            price = price[0] if price else ''
            if price.endswith('p'):
                # Pence-only price, e.g. "50p" -> "0.50p" (the trailing
                # 'p' is stripped by the re.sub below).
                price = '0.' + price.strip()
            if 'p' in price:
                price = re.sub('[p ,]', '', price)
                product['price'] = Price(priceCurrency='GBP', price=price)
            else:
                self.log('Unknown price format at %s' % response)

        if not product.get('url', '').startswith('http'):
            product['url'] = urlparse.urljoin('http://www.waitrose.com',
                                              product['url'])
        yield product['url'], product
def _parse_single_product(self, response):
    """Build a ``SiteProductItem`` from the ``productdata`` meta tag of
    a single Tesco product page and return it."""
    # The meta content is a '|'-separated list of JSON objects with a
    # trailing separator; rewrap it as a JSON array to parse it, then
    # take the first object.
    productdata = "[" + is_empty(
        response.xpath('//meta[@name="productdata"]/@content').extract(),
        "")[:-1].replace("|", ",") + "]"
    productdata = is_empty(json.loads(productdata))

    product = SiteProductItem()
    if productdata:
        product["title"] = productdata["name"]
        product["is_out_of_stock"] = not productdata["available"]
        product["url"] = "http://www.tesco.com/groceries/product/details/"\
                         "?id=" + str(productdata["productId"])

        regex = "id=([A-Z0-9\-]+)"
        reseller_id = re.findall(regex, product.get('url', ''))
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        # BUG FIX: a bare ``except:`` also swallows SystemExit and
        # KeyboardInterrupt; only ignore ordinary errors when the price
        # field is missing or malformed.
        try:
            product["price"] = Price(price=productdata["price"],
                                     priceCurrency="GBP")
        except Exception:
            pass

        product["image_url"] = productdata["mediumImage"]
        product["search_term"] = ""
        product["brand"] = is_empty(self.brand_from_title(
            product["title"]))
        product["site"] = is_empty(self.allowed_domains)

    if self.product_url:
        product['is_single_result'] = True
    # NOTE(review): search_term is set to "" above, which is falsy, so
    # this delete can never fire — confirm whether it should test for
    # key presence instead.
    if product.get("search_term"):
        del product['search_term']
    return product
def _scrape_product_links(self, response):
    """Yield ``(url, SiteProductItem)`` pairs for a listing page.

    Mobile user agents get bare links (the product page must be fetched
    to populate the item); desktop gets items pre-populated from the
    page's ``productdata`` JSON blobs.
    """
    # To populate the description, fetching the product page is necessary.
    if self.user_agent_key not in ["desktop", "default"]:
        links = response.xpath(
            '//section[contains(@class,"product_listed")]'
            '//div[contains(@class,"product_info")]//a/@href').extract()
        if not links:
            self.log(
                "[Mobile] Found no product data on: %s" % response.url,
                ERROR)
        for link in links:
            yield urlparse.urljoin(response.url, link), SiteProductItem()
    else:
        url = response.url
        # This will contain everything except for the URL and description.
        product_jsons = response.xpath(
            '//meta[@name="productdata"]/@content').extract()
        if not product_jsons:
            self.log("Found no product data on: %s" % url, ERROR)
            # BUG FIX: the original fell through to product_jsons[0]
            # below and crashed with IndexError; stop after logging.
            return
        product_links = response.css(
            ".product > .desc > h2 > a ::attr('href')").extract()
        if not product_links:
            self.log("Found no product links on: %s" % url, ERROR)
        for product_json, product_link in zip(product_jsons[0].split('|'),
                                              product_links):
            prod = SiteProductItem()
            cond_set_value(prod, 'url',
                           urlparse.urljoin(url, product_link))
            product_data = json.loads(product_json)
            cond_set_value(prod, 'price', product_data.get('price'))
            cond_set_value(prod, 'image_url',
                           product_data.get('mediumImage'))
            #prod['upc'] = product_data.get('productId')
            if prod.get('price', None):
                prod['price'] = Price(price=str(prod['price']).replace(
                    ',', '').strip(), priceCurrency='GBP')
            try:
                brand, title = self.brand_from_title(product_data['name'])
                cond_set_value(prod, 'brand', brand)
                cond_set_value(prod, 'title', title)
            except KeyError:
                raise AssertionError(
                    "Did not find title or brand from JS for product: %s"
                    % product_link)
            yield None, prod
def _scrape_product_links(self, response):
    """Yield ``(Request, SiteProductItem)`` pairs, one per product box.

    Each request carries its partially-populated product in ``meta`` and
    rotates through ``USER_AGENT_LIST`` round-robin.
    """
    for product_box in self._fetch_product_boxes(response):
        prod_url = urlparse.urljoin(response.url,
                                    self._link_from_box(product_box))

        item = SiteProductItem()
        self._populate_from_box(response, product_box, item)
        # Record pages where the brand could not be extracted.
        if not item.get('brand'):
            dump_url_to_file(response.url)

        request_meta = response.meta.copy()
        request_meta['product'] = item

        # Round-robin user-agent rotation: take from the front, put
        # back at the end.
        agent = USER_AGENT_LIST.pop(0)
        USER_AGENT_LIST.append(agent)

        req = Request(prod_url, callback=self.parse_product,
                      meta=request_meta)
        req.headers.setdefault('User-Agent', agent)
        yield req, item
def _scrape_product_links(self, response):
    """Yield ``(link, SiteProductItem)`` pairs scraped from the search
    results list, parsing EUR prices and defaulting locale to fr-FR."""
    products = response.xpath('//ol[contains(@class, "search-results")]'
                              '//div[contains(@class, "sc_result_list")]')
    if not products:
        self.log("Found no product links.", ERROR)
    for product in products:
        prod_links = product.xpath(
            './/div[contains(@class, "sc_result_title")]//a/@href'
        ).extract()
        if not prod_links:
            self.log(
                "Failed to extract product link for item: %r"
                % (product.extract(),), ERROR)
            continue
        prod_link = prod_links[0]

        item = SiteProductItem()
        cond_set(
            item,
            'title',
            product.css('div.sc_result_title a::text').extract(),
            conv=string.strip,
        )
        cond_set(
            item,
            'price',
            product.css('div.sc_result_price::text').re(r'(\d.+)'),
        )
        if item.get('price', None):
            if '€' not in item['price']:
                # BUG FIX: the original used
                # 'Unknown currency at' % response.url, which raises
                # TypeError (no %s placeholder in the format string).
                self.log('Unknown currency at %s' % response.url)
            else:
                item['price'] = Price(price=item['price'].replace(
                    ',', '').replace('€', '').strip(),
                    priceCurrency='EUR')
        cond_set(
            item,
            'locale',
            response.xpath('//html/@lang').extract(),
            conv=string.strip,
        )
        cond_set_value(item, 'locale', 'fr-FR')  # Default.
        yield prod_link, item
def _scrape_product_links(self, response):
    """Yield ``(api_url, SiteProductItem)`` pairs from the Asda JSON
    search response, mapping each raw item onto a product."""
    data = json.loads(response.body_as_unicode())
    for item in data['items']:
        prod = SiteProductItem()
        prod['title'] = item['itemName']
        prod['brand'] = item['brandName']
        prod['site'] = 'http://www.asda.com/'
        # Hardcoded, store seems not to have out of stock products
        prod['is_out_of_stock'] = False
        prod['price'] = item['price']
        if prod.get('price', None):
            # BUG FIX: str() so a numeric price from the API does not
            # crash the .replace() chain (matches the other spiders'
            # price handling).
            prod['price'] = Price(price=str(prod['price']).replace(
                '£', '').replace(',', '').strip(), priceCurrency='GBP')

        # FIXME Verify by comparing a prod in another site.
        total_reviews = int(item['totalReviewCount'])
        avg_stars = float(item['avgStarRating'])
        prod['buyer_reviews'] = BuyerReviews(num_of_reviews=total_reviews,
                                             average_rating=avg_stars,
                                             rating_by_star={})

        prod['model'] = item['cin']

        image_url = item.get('imageURL')
        if not image_url and "images" in item:
            image_url = item.get('images').get('largeImage')
        prod['image_url'] = image_url

        pId = is_empty(re.findall(r"itemid=(\d+)", item['productURL']))
        if pId and "search_term" in response.meta:
            prod['url'] = self.PRODUCT_LINK % (urllib.quote(
                response.meta["search_term"]), pId)
        elif "imageURL" in item:
            # NOTE(review): this falls back to the *image* URL as the
            # product URL — presumably item['productURL'] was intended;
            # confirm before changing.
            prod["url"] = item['imageURL']

        prod['locale'] = "en-GB"
        products_ids = item['id']
        url = self.API_URL.format(id=products_ids)
        yield url, prod