def start_requests(self):
    """Generate Requests from the SEARCH_URL and the search terms."""
    for search_term in self.searchterms:
        search_url = self.url_formatter.format(
            self.SEARCH_URL,
            search_term=urllib.quote_plus(search_term.encode('utf-8')),
        )
        yield Request(
            search_url,
            self._parse_all_cat,
            meta={'search_term': search_term, 'remaining': self.quantity},
        )
    if self.product_url:
        # Single-product mode: crawl one explicit URL.
        item = SiteProductItem()
        item['is_single_result'] = True
        item['url'] = self.product_url
        item['search_term'] = ''
        yield Request(self.product_url,
                      self._parse_single_product,
                      meta={'product': item})
    if self.products_url:
        # Multi-product mode: URLs are joined with the '||||' separator.
        for product_url in self.products_url.split('||||'):
            item = SiteProductItem()
            item['url'] = product_url
            item['search_term'] = ''
            yield Request(product_url,
                          self._parse_single_product,
                          meta={'product': item})
def _scrape_product_links(self, response):
    """Yield (url, item) pairs for search results, falling back to
    brand-page navigation when the regular result grid is absent.

    Fixes over the previous version:
    * guard the brand-link list before ``pop(0)`` (previously an
      unhandled IndexError when the brand page had no links);
    * the "Found no product links" log was unreachable (dead code after
      an unconditional ``return``) -- it now fires on the empty path.
    """
    def full_url(url):
        return urlparse.urljoin(response.url, url)

    self.total_matched = 0
    links = response.xpath(
        "//div[@class='result-row']"
        "/article/a[@class='product-link']/@href").extract()
    if not links:
        no_results = response.xpath(
            "//div[@class='mod-important']/h1/text()").re(r'No results.*')
        if not no_results:
            # Extract links from brand-page navigation instead.
            links = response.xpath(
                "//div[@id='content']/div/div/div/section/section"
                "/div/ul/li/a/@href").extract()
            if links:
                url = full_url(links.pop(0))
                new_meta = response.meta.copy()
                new_meta['pages_wlinks'] = links
                new_meta['ranking'] = 1
                new_meta['count'] = 0
                new_meta['links'] = []
                yield Request(url, self._scrape_brand_links,
                              meta=new_meta), SiteProductItem()
                return
        self.log("Found no product links.", DEBUG)
        return
    for link in links:
        yield full_url(link), SiteProductItem()
def _parse_single_product(self, response):
    """Build a SiteProductItem from the page's embedded productdata JSON.

    The meta tag holds '|'-separated JSON objects; they are rewrapped
    into a JSON array before parsing.  The bare ``except`` around the
    price construction has been narrowed to the exceptions that can
    actually occur there (missing key / bad value type).
    """
    productdata = "[" + is_empty(
        response.xpath('//meta[@name="productdata"]/@content').extract(),
        "")[:-1].replace("|", ",") + "]"
    productdata = is_empty(json.loads(productdata))
    product = SiteProductItem()
    if productdata:
        product["title"] = productdata["name"]
        product["is_out_of_stock"] = not productdata["available"]
        product["url"] = "http://www.tesco.com/groceries/product/details/"\
            "?id=" + str(productdata["productId"])
        # Reseller id is the product id embedded in the URL query string.
        regex = r"id=([A-Z0-9\-]+)"
        reseller_id = re.findall(regex, product.get('url', ''))
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)
        try:
            product["price"] = Price(price=productdata["price"],
                                     priceCurrency="GBP")
        except (KeyError, TypeError, ValueError):
            # Price missing or malformed -- leave the field unset.
            pass
        product["image_url"] = productdata["mediumImage"]
        product["search_term"] = ""
        product["brand"] = is_empty(self.brand_from_title(
            product["title"]))
        product["site"] = is_empty(self.allowed_domains)
    if self.product_url:
        product['is_single_result'] = True
    # NOTE(review): search_term is set to "" (falsy) above, so this del
    # never fires -- kept for behavioral parity; confirm before removing.
    if product.get("search_term"):
        del product['search_term']
    return product
def start_requests(self):
    """Kick off one total-count request per search term, plus any direct
    product-URL requests."""
    for search_term in self.searchterms:
        form_data = self.FORM_DATA.copy()
        form_data['searchTerm'] = search_term
        form_data['orderBy'] = self.SORT
        self.pages[search_term] = 0
        # send request just to count number of total results
        yield Request(
            url=self.url_formatter.format(self.FIRST_URL,
                                          search_term=quote(search_term)),
            callback=self.parse_total_and_start_search,
            meta={'form_data': form_data,
                  'search_term': search_term,
                  'remaining': self.quantity})
    if self.product_url:
        item = SiteProductItem()
        item['is_single_result'] = True
        item['url'] = self.product_url
        yield Request(self.product_url,
                      self._parse_single_product,
                      meta={'product': item})
    if self.products_url:
        for product_url in self.products_url.split('||||'):
            item = SiteProductItem()
            item['url'] = product_url
            item['search_term'] = ''
            yield Request(product_url,
                          self._parse_single_product,
                          meta={'product': item})
def start_requests(self):
    """Generate the initial search request per term (page number left
    blank on the first request), plus direct product-URL requests."""
    for search_term in self.searchterms:
        search_url = self.url_formatter.format(
            self.SEARCH_URL,
            search_term=urllib.quote_plus(search_term.encode('utf-8')),
            page=''  # don't set for first request, or results will differ
        )
        self.pages[search_term] = 2
        yield Request(search_url,
                      meta=dict(search_term=search_term,
                                remaining=self.quantity))
    if self.product_url:
        item = SiteProductItem()
        item['is_single_result'] = True
        item['url'] = self.product_url
        item['search_term'] = ''
        yield Request(self.product_url,
                      self._parse_single_product,
                      meta={'product': item})
    if self.products_url:
        for product_url in self.products_url.split('||||'):
            item = SiteProductItem()
            item['url'] = product_url
            item['search_term'] = ''
            yield Request(product_url,
                          self._parse_single_product,
                          meta={'product': item})
def _scrape_product_links(self, response):
    """Scraping product links from search page.

    Bug fixed: the final log message called ``.format(response.url)`` on
    a string with no ``{}`` placeholder, so the URL was silently dropped.
    """
    items = response.xpath(
        '//ul[@id="prod-list"]/li[contains(@class, "product-list-item")]')
    if items:
        for item in items:
            link = is_empty(
                item.xpath('./span[@class="product-name-header"]/'
                           'a/@href').extract())
            res_item = SiteProductItem()
            yield link, res_item
    else:
        # Listing may instead be embedded as escaped JSON/HTML in the body.
        links = re.findall(
            r'<a href=\\"(\/p\/\d+)\\"',
            response.body_as_unicode().replace('\u003c', '<').replace(
                '\u003e', '>'))
        if links:
            links = list(set(links))
            for link in links:
                res_item = SiteProductItem()
                yield link, res_item
        else:
            self.log("Found no product links in {}.".format(response.url),
                     INFO)
def _scrape_product_links(self, response):
    """Yield (url, product) pairs from the Waitrose products JSON.

    Bug fixed: ``product.get('image_url', 'None')`` used the *string*
    ``'None'`` as default, which is truthy -- products without an image
    were given a bogus ``image_url``.  The default is now ``None`` so
    missing images leave the field unset.
    """
    data = WaitroseProductsSpider._get_data(response)
    for product_data in data['products']:
        product = SiteProductItem()
        for product_key, data_key in self._PRODUCT_TO_DATA_KEYS.items():
            value = product_data.get(data_key, 'null')
            if value != 'null':
                product[product_key] = product_data[data_key]
        image_url = product.get('image_url')
        if image_url:
            product['image_url'] = urlparse.urljoin('http://', image_url)
        # This one is not in the mapping since it requires transformation.
        #product['upc'] = int(product_data['productid'])
        if product.get('price', None):
            price = product['price']
            # Normalize the pound sign to 'p' so pence and pound amounts
            # can be handled by the same pattern.
            price = price.replace('£', 'p')
            price = re.findall(r'(p? *[\d ,.]+ *p?) *', price)
            price = price[0] if price else ''
            if price.endswith('p'):
                price = '0.' + price.strip()
            if 'p' in price:
                price = re.sub('[p ,]', '', price)
            product['price'] = Price(priceCurrency='GBP', price=price)
        else:
            self.log('Unknown price format at %s' % response)
        if not product.get('url', '').startswith('http'):
            product['url'] = urlparse.urljoin('http://www.waitrose.com',
                                              product['url'])
        yield product['url'], product
def _scrape_product_links(self, response):
    """Yield (link, item) pairs from the search-result grid, falling back
    to the narrow-results menu when the grid is empty.

    Fixes: ``menu_links.pop(0)`` could raise IndexError when the menu was
    empty, and the "Found no product links" log was unreachable (dead
    code after an unconditional ``return``).
    """
    links = response.xpath(
        "//div[@class='productSearchResults']"
        "/div[@id='ProductViewListGrid']"
        "/div[contains(@class,'product_item')]"
        "/*/*/div[@class='pl_productName']/h5/a/@href").extract()
    no_results = response.xpath(
        "//div[@class='searchResultsSummary']"
        "/h1/text()").re(r'.*We\'re sorry.*could not find.*')
    if no_results:
        links = []
    if not links:
        menu_links = response.xpath(
            "//div[@class='narrowResults']/div/ul/li/a/@href").extract()
        if menu_links:
            url = menu_links.pop(0)
            new_meta = response.meta.copy()
            new_meta['pages_wlinks'] = menu_links
            new_meta['count'] = 0
            yield Request(url, self._scrape_brand_links,
                          meta=new_meta), SiteProductItem()
        else:
            self.log("Found no product links.", ERROR)
        return
    for link in links:
        yield link, SiteProductItem()
def _parse_single_product(self, response):
    """Parse a single product page by reusing the search-page link
    scraper, enrich the item from the JSON body, then optionally chain a
    review-page request.

    Fix: ``KeyError``/``TypeError`` were added to the exception guard --
    the JSON lookups (``data['items']``,
    ``item['upcNumbers'][0]['upcNumber']``) raise them just as readily as
    ``IndexError``/``ValueError``.
    """
    product = response.meta["product"]
    result = self._scrape_product_links(response)
    # _scrape_product_links yields (url, item) pairs; merge any item
    # found into the product from meta (meta values win on key clash).
    for pair in result:
        for member in pair:
            if isinstance(member, SiteProductItem):
                if "search_term" in member:
                    del member["search_term"]
                product = SiteProductItem(
                    dict(member.items() + product.items()))
    try:
        data = json.loads(response.body_as_unicode())
        item = data['items'][0]
        if item.get("images", {}).get("largeImage"):
            product["image_url"] = item.get("images").get("largeImage")
        product['upc'] = item['upcNumbers'][0]['upcNumber']
    except (IndexError, KeyError, TypeError, ValueError):
        pass
    product_id = re.findall(r'itemid=(\d+)', response.url)
    if product_id:
        url = self.REVIEW_URL % product_id[0]
        meta = {'product': product}
        return Request(url=url, meta=meta, callback=self._parse_review)
    return product
def set_zip_code(self, response):
    """Three-stage store-selection flow, then start the real crawl.

    Stage 1 -> request the stores JSON for the configured zip code.
    Stage 2 -> select the nearest store from that JSON.
    Otherwise -> emit the regular search / single-product requests.
    """
    stage = response.meta.get('zip_code_stage')
    self.log("zip code stage: %s" % stage, DEBUG)
    if stage == 1:
        meta = response.meta.copy()
        meta['zip_code_stage'] = 2
        yield Request(url=self.STORES_JSON.format(zip_code=self.zip_code),
                      callback=self.set_zip_code,
                      headers={'X-Crawlera-Cookies': 'disable'},
                      meta=meta)
    elif stage == 2:
        nearest_store = json.loads(response.body)['Location'][0]
        meta = response.meta.copy()
        meta['zip_code_stage'] = 3
        yield Request(url=self.SELECT_STORE.format(key=nearest_store['KEY']),
                      headers={'X-Crawlera-Cookies': 'disable'},
                      callback=self.set_zip_code,
                      meta=meta)
    else:
        for search_term in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(
                        search_term.encode('utf-8')),
                ),
                headers={'X-Crawlera-Cookies': 'disable'},
                meta={'search_term': search_term,
                      'remaining': self.quantity},
            )
        if self.product_url:
            item = SiteProductItem()
            item['is_single_result'] = True
            item['url'] = self.product_url
            item['search_term'] = ''
            yield Request(self.product_url,
                          self._parse_single_product,
                          headers={'X-Crawlera-Cookies': 'disable'},
                          meta={'product': item})
        if self.products_url:
            for product_url in self.products_url.split('||||'):
                item = SiteProductItem()
                item['url'] = product_url
                item['search_term'] = ''
                yield Request(product_url,
                              self._parse_single_product,
                              headers={'X-Crawlera-Cookies': 'disable'},
                              meta={'product': item})
def _scrape_product_links(self, response):
    """Yield (url, item) pairs from a listing page.

    Mobile user agents get plain link extraction; the desktop page embeds
    '|'-separated JSON blobs in a meta tag.  Fix: the desktop branch now
    returns early when no data / no links are found -- previously it
    logged the error and then crashed with IndexError on
    ``product_jsons[0]``.
    """
    # To populate the description, fetching the product page is necessary.
    if self.user_agent_key not in ["desktop", "default"]:
        links = response.xpath(
            '//section[contains(@class,"product_listed")]'
            '//div[contains(@class,"product_info")]//a/@href').extract()
        if not links:
            self.log("[Mobile] Found no product data on: %s" % response.url,
                     ERROR)
        for link in links:
            yield urlparse.urljoin(response.url, link), SiteProductItem()
    else:
        url = response.url
        # This will contain everything except for the URL and description.
        product_jsons = response.xpath(
            '//meta[@name="productdata"]/@content').extract()
        if not product_jsons:
            self.log("Found no product data on: %s" % url, ERROR)
            return
        product_links = response.css(
            ".product > .desc > h2 > a ::attr('href')").extract()
        if not product_links:
            self.log("Found no product links on: %s" % url, ERROR)
            return
        for product_json, product_link in zip(product_jsons[0].split('|'),
                                              product_links):
            prod = SiteProductItem()
            cond_set_value(prod, 'url', urlparse.urljoin(url, product_link))
            product_data = json.loads(product_json)
            cond_set_value(prod, 'price', product_data.get('price'))
            cond_set_value(prod, 'image_url',
                           product_data.get('mediumImage'))
            #prod['upc'] = product_data.get('productId')
            if prod.get('price', None):
                prod['price'] = Price(
                    price=str(prod['price']).replace(',', '').strip(),
                    priceCurrency='GBP')
            try:
                brand, title = self.brand_from_title(product_data['name'])
                cond_set_value(prod, 'brand', brand)
                cond_set_value(prod, 'title', title)
            except KeyError:
                raise AssertionError(
                    "Did not find title or brand from JS for product: %s"
                    % product_link)
            yield None, prod
def _scrape_product_links(self, response):
    """Yield (Request, item) pairs for each product box, rotating through
    USER_AGENT_LIST so successive requests use different agents."""
    for product_box in self._fetch_product_boxes(response):
        product_url = urlparse.urljoin(response.url,
                                       self._link_from_box(product_box))
        item = SiteProductItem()
        self._populate_from_box(response, product_box, item)
        if not item.get('brand', None):
            # Record pages where brand extraction failed, for debugging.
            dump_url_to_file(response.url)
        request_meta = response.meta.copy()
        request_meta['product'] = item
        # Round-robin rotation: take the head agent and push it back.
        agent = USER_AGENT_LIST.pop(0)
        USER_AGENT_LIST.append(agent)
        product_request = Request(product_url,
                                  callback=self.parse_product,
                                  meta=request_meta)
        product_request.headers.setdefault('User-Agent', agent)
        yield product_request, item
def _scrape_product_links(self, response):
    """Yield (link, item) pairs from the search-result list.

    Bug fixed: the "unknown currency" log used
    ``'Unknown currency at' % response.url`` -- a %-format with no
    conversion specifier, which raises TypeError at runtime.  It now
    actually interpolates the URL.
    """
    products = response.xpath('//ol[contains(@class, "search-results")]'
                              '//div[contains(@class, "sc_result_list")]')
    if not products:
        self.log("Found no product links.", ERROR)
    for product in products:
        prod_links = product.xpath(
            './/div[contains(@class, "sc_result_title")]//a/@href'
        ).extract()
        if not prod_links:
            self.log(
                "Failed to extract product link for item: %r"
                % (product.extract(), ), ERROR)
            continue
        prod_link = prod_links[0]
        item = SiteProductItem()
        cond_set(
            item, 'title',
            product.css('div.sc_result_title a::text').extract(),
            conv=string.strip,
        )
        cond_set(
            item, 'price',
            product.css('div.sc_result_price::text').re(r'(\d.+)'),
        )
        if item.get('price', None):
            if not '€' in item['price']:
                self.log('Unknown currency at %s' % response.url)
            else:
                item['price'] = Price(
                    price=item['price'].replace(',', '').replace(
                        '€', '').strip(),
                    priceCurrency='EUR')
        cond_set(
            item, 'locale',
            response.xpath('//html/@lang').extract(),
            conv=string.strip,
        )
        cond_set_value(item, 'locale', 'fr-FR')  # Default.
        yield prod_link, item
def start_requests(self):
    """Generate Requests from the SEARCH_URL and the search terms."""
    # Stolen from walmart
    #settings.overrides['CRAWLERA_ENABLED'] = True
    self.url_formatter.defaults['page'] = 1
    for search_term in self.searchterms:
        search_url = self.url_formatter.format(
            self.SEARCH_URL,
            search_term=urllib.quote_plus(search_term.encode('utf-8')),
            sort_mode=self.sort_mode)
        yield Request(
            search_url,
            meta={
                'search_term': search_term,
                'remaining': self.quantity,
                # 'dont_redirect': True,
                'handle_httpstatus_list': [302],
                'page': 1,
            },
            headers={"User-Agent": self.user_agent},
            dont_filter=True,
            cookies={'shippingCountry': 'US', 'currency': 'USD'})
    if self.product_url:
        item = SiteProductItem()
        item['is_single_result'] = True
        item['url'] = self.product_url
        yield Request(self.product_url,
                      self._parse_single_product,
                      meta={'product': item},
                      dont_filter=True)
def _scrape_product_links(self, response):
    """Yield (url, item) pairs from the products grid, annotating items
    with shelf data from the breadcrumb links and the og:title meta.

    Fix: ``shelf_name`` was only bound inside the ``try`` -- when the
    og:title xpath matched nothing but categories existed, the loop
    raised NameError.  It is now initialized to ``None`` up front.
    """
    urls = response.xpath(
        "//ul[contains(@class,'products-grid')]/li"
        "//a[contains(@class, 'product-image')]/@href"
    ).extract()
    urls = [urlparse.urljoin(response.url, x) if x.startswith('/') else x
            for x in urls]
    if not urls:
        self.log("Found no product links.", DEBUG)
    # parse shelf category
    shelf_categories = response.xpath(
        '//div[@class="card_container"]'
        '//div[contains(@class, "no_gutter")]//a/@href').extract()
    shelf_categories = [category.strip() for category in shelf_categories]
    shelf_categories = filter(None, shelf_categories)
    shelf_name = None
    try:
        shelf_name = response.xpath(
            '//meta[@name="og:title"]/@content').extract()[0].strip()
    except IndexError:
        pass
    for url in urls:
        item = SiteProductItem()
        if shelf_categories:
            item['shelf_name'] = shelf_name
            item['shelf_path'] = shelf_categories[1:]
        yield url, item
def start_requests(self):
    """Generate Requests from the SEARCH_URL and the search terms."""
    self.url_formatter.defaults['page_no'] = 1
    for term in self.searchterms:
        # The site needs the term in two casings; the helper produces both.
        term_formatted, term_formatted_upper = \
            self.generate_search_terms(term)
        search_url = self.url_formatter.format(
            self.SEARCH_URL,
            search_term=term_formatted,
            search_term_upper=term_formatted_upper,
        )
        yield Request(
            search_url,
            meta={'search_term': term,
                  'remaining': self.quantity,
                  'page': 1},
            dont_filter=True,
        )
    if self.product_url:
        item = SiteProductItem()
        item['is_single_result'] = True
        item['url'] = self.product_url
        yield Request(self.product_url,
                      self._parse_single_product,
                      meta={'product': item},
                      dont_filter=True)
def _get_json_data(self, item):
    """Build a SiteProductItem from a JSON ``productInfo`` record.

    The sale/regular price branches previously duplicated identical
    parsing code and raised IndexError when the regex found no number.
    Both branches now share one helper and an empty match is skipped.
    (The old ``price[0]`` vs ``price[-1]`` split was redundant: for a
    one-element list they are the same element, so "last match wins"
    preserves behavior.)
    """
    def _price_from_text(text):
        # Last figure wins, e.g. in "was $X / now $Y" strings.
        matches = re.findall(r'(/?\d+.\d+)', text)
        if matches:
            return Price(price=float(matches[-1]), priceCurrency='USD')
        return None

    product = SiteProductItem()
    item = item['productInfo']
    price_info = item['priceInfo']
    raw_price = None
    if 'salePrice' in price_info:
        raw_price = price_info['salePrice']
    elif 'regularPrice' in price_info:
        raw_price = price_info['regularPrice']
    if raw_price is not None:
        price = _price_from_text(raw_price)
        if price is not None:
            product['price'] = price
    for message in item.get('channelAvailability', []):
        if 'displayText' in message:
            if 'Not sold online' in message['displayText']:
                product['is_in_store_only'] = True
            if 'Out of stock online' in message['displayText']:
                product['is_out_of_stock'] = True
    cond_set_value(product, 'upc', item.get('upc'))
    return product
def _scrape_product_links(self, response):
    """Yield (link, item) pairs either from the tires listing markup or
    from the ``SRPInitialLoad(...)`` JSON blob embedded in the body.

    Fix: when the JSON wrapper regex did not match, ``links`` stayed
    ``None`` and the list comprehension raised TypeError; a missing or
    null ``results`` key is handled the same way (empty list).
    """
    if self.force_tires:
        links = response.xpath(
            "//ul[@id='productList']/li/div[@class='productImage']/a/@href"
        ).extract()
    else:
        text = response.body_as_unicode()
        m = re.match(r'^SRPInitialLoad\((.*)\)', text, re.DOTALL)
        links = []
        if m:
            try:
                jsdata = json.loads(m.group(1))
            except ValueError:
                jsdata = {}
            self.jsdata = jsdata

            def full_ct_url(url):
                return urlparse.urljoin('http://www.canadiantire.ca/', url)

            links = [full_ct_url(x['field']['short-pdp-url'] + '.html')
                     for x in (jsdata.get('results') or [])]
    if not links:
        self.log("Found no product links.", ERROR)
    for link in links:
        yield link, SiteProductItem()
def _get_products(self, response):
    """Yield SiteProductItems parsed from the JSON SearchLane payload;
    any parsing failure is logged rather than propagated."""
    try:
        payload = json.loads(response.body)
        search_items = None
        # Only the SearchLane lane carries product entries.
        for lane in payload['_embedded']['lanes']:
            if lane.get('type') == 'SearchLane':
                search_items = lane['_embedded']['items']
                break
        if not search_items:
            self.log('Products was not found.', DEBUG)
            return
        for entry in search_items:
            product = self.__parse_product(
                SiteProductItem(), entry['_embedded']['product'])
            if not product:
                continue
            product['url'] = (self.BASE_URL +
                              entry['navItem']['link']['href'])
            product['reseller_id'] = self._parse_reseller_id(
                product.get('url', ''))
            yield product
    except Exception as e:
        self.log('Can\'t parse product list body. ERROR: %s.' % str(e),
                 ERROR)
        return
def parse_product(self, response):
    """Populate the product item from the product page; when target or
    walmart matching is enabled, chain a request to asintoupc.com."""
    product = response.meta.get('product') or SiteProductItem()
    cond_set_value(product, 'shelf_path', response.meta.get('shelf_path'))
    cond_set_value(product, 'shelf_name', response.meta.get('shelf_name'))
    title = response.xpath('//h1/span/text()').extract()[0].strip()
    cond_set_value(product, 'title', title)
    data_body = response.xpath('//script[contains(text(), '
                               '"merchantID")]/text()').extract()
    try:
        asin = re.findall(r'"ASIN" : "(\w+)"', data_body[0])[0]
    except IndexError:
        # Fall back to the 10-character id embedded in the URL.
        asin = re.findall('\/([A-Z0-9]{10})', response.url)[0]
    cond_set_value(product, 'asin', asin)
    cond_set_value(product, 'url', response.url)
    cond_set_value(product, 'ranking', response.meta.get('ranking'))
    cond_set_value(product, 'brand', self._parse_brand(response))
    cond_set_value(product, 'price', self._parse_price(response))
    if self.match_target or self.match_walmart:
        upc_request = Request(url='http://asintoupc.com',
                              callback=self.get_payload,
                              dont_filter=True)
        upc_request.meta['product'] = product
        yield upc_request
    else:
        yield product
def _scrape_product_links(self, response):
    """Yield (url, item) pairs from the listing page.

    Fixes:
    * the absolutized URL list was assigned to a misspelled ``ulrs``
      variable and thrown away, so relative links were yielded as-is;
    * ``shelf_name``/``shelf_path`` were swapped relative to the sibling
      spiders: name should be the last breadcrumb (a string), path the
      full breadcrumb list.
    """
    urls = response.xpath(
        '//li[contains(@class,"productDisplay")]'
        '//div[@class="productDisplay_image"]/a/@href').extract()
    try:
        products = re.findall(
            'var\s?filterResults\s?=\s?jq\.parseJSON\([\'\"](\{.+?\})[\'\"]\);',
            response.body, re.MULTILINE)[0].decode('string-escape')
        products = json.loads(products).get('organicZoneInfo').get(
            'records')
        urls += [product.get('pdpUrl') for product in products]
    except Exception as e:
        self.log('Error loading JSON: %s at URL: %s'
                 % (str(e), response.url), WARNING)
        self.log('Extracted urls using xpath: %s' % (len(urls)), WARNING)
    urls = [urlparse.urljoin(response.url, url) for url in urls]
    shelf_categories = response.xpath(
        '//*[contains(@data-anid, "breadcrumbIndex_")]/text()').extract()
    shelf_category = shelf_categories[-1] if shelf_categories else None
    for url in urls:
        item = SiteProductItem()
        if shelf_category:
            item['shelf_name'] = shelf_category
        if shelf_categories:
            item['shelf_path'] = shelf_categories
        yield url, item
def _scrape_product_links(self, response):
    """Yield (url, item) pairs, absolutizing site-relative links and
    attaching shelf info derived from the canonical URL."""
    links = response.xpath(
        '//ul[contains(@class,"search-result-items")]/li/a/@href'
    ).extract()
    if not links:
        links = response.xpath('//a[@class="name-link"]/@href').extract()
    # Prefix scheme+host onto any link that lacks the site root.
    links = [link if self.ROOT_URL in link
             else 'http://' + self.ROOT_URL + link
             for link in links]
    canonical = response.xpath('.//link[@rel="canonical"]/@href').extract()
    shelf_categories = []
    shelf_category = ''
    if canonical:
        shelf_categories = [part.strip()
                            for part in canonical[0].split('/')
                            if len(part.strip()) > 1]
        shelf_category = shelf_categories[-1] if shelf_categories else None
    for item_url in links:
        item = SiteProductItem()
        if shelf_category:
            item['shelf_name'] = shelf_category
        if shelf_categories:
            item['shelf_path'] = shelf_categories
        yield item_url, item
def _scrape_product_links(self, response):
    """For every product box, build an item and either yield a follow-up
    product-page Request (when a link is available) or the bare item."""
    for box in self._fetch_product_boxes(response):
        try:
            url = self._link_from_box(box)
        except IndexError:  # Most expected
            self.log('IndexError on %s' % response.url, ERROR)
            url = None
        if self.REQUIRE_PRODUCT_PAGE and url is None:
            self.log('No link found for product on %s' % response.url,
                     DEBUG)
        product = SiteProductItem()
        box_meta = self._populate_from_box(response, box, product)
        self._populate_hardcoded_fields(product)
        self._get_model_from_title(product)
        new_meta = response.meta.copy() if hasattr(response, 'meta') else {}
        if box_meta and url:
            new_meta.update(box_meta)
        if url:
            new_meta['product'] = product
            yield (Request(urlparse.urljoin(response.url, url),
                           self.parse_product,
                           meta=new_meta,
                           errback=self._handle_product_page_error),
                   product)
        else:
            yield None, product
def _scrape_product_links(self, response):
    """Yield (url, item) pairs from product-description boxes.

    Fix: boxes without a product link are skipped instead of raising
    IndexError on ``url[0]`` mid-generator as before.
    """
    for box in response.css('.product-description'):
        urls = box.xpath('h3/a/@href').extract()
        if not urls:
            continue
        product = SiteProductItem()
        cond_set(product, 'brand', box.xpath('p/text()').extract())
        yield urls[0], product
def _scrape_product_links(self, response):
    """Yield (url, item) pairs for the product grid, de-duplicated via
    ``self.product_filter`` and annotated with breadcrumb shelf data.

    Fixes: ``shelf_name`` was referenced without being bound when the
    page-title xpath matched nothing (NameError); a redundant nested
    ``if shelf_categories:`` was collapsed into one check.
    """
    urls = response.xpath(
        "//div[contains(@class,'product') "
        "and contains(@class,'plp-grid')]"
        "//descendant::a[contains(@class, 'item_description')]/@href"
    ).extract()
    urls = [urlparse.urljoin(response.url, x) if x.startswith('/') else x
            for x in urls]
    if not urls:
        self.log("Found no product links.", DEBUG)
    # parse shelf category
    shelf_categories = response.xpath(
        '//ul[@id="headerCrumb"]/li//text()').extract()
    shelf_categories = [category.strip() for category in shelf_categories]
    shelf_categories = filter(None, shelf_categories)
    shelf_name = None
    try:
        shelf_name = response.xpath(
            '//h1[@class="page-title"]/text()').extract()[0].strip()
    except IndexError:
        pass
    for url in urls:
        if url in self.product_filter:
            continue
        self.product_filter.append(url)
        item = SiteProductItem()
        if shelf_categories:
            item['shelf_name'] = shelf_name
            item['shelf_path'] = shelf_categories[1:]
        yield url, item
def _scrape_product_links(self, response):
    """Yield (absolute url, item) pairs from a search page, attaching
    shelf info taken from meta or derived from the page itself."""
    shelf_categories = response.meta.get('shelf_categories')
    links = response.xpath(
        './/a[contains(@class, "product")]/@href'
    ).extract()
    if not links:
        self.log("Found no product links in {url}".format(
            url=response.url), INFO)
        return
    if not shelf_categories:
        shelf_categories = self._get_shelf_path(response)
    shelf_category = shelf_categories[-1] if shelf_categories else None
    for link in links:
        # sometimes there is link to category instead of a product like here:
        # https://www.microsoftstore.com/store/msusa/en_US/cat/Microsoft-Lumia/categoryID.66852000?icid=en_US_Homepage_whatsnew_5_TEST_EDU_160525
        if '/pdp/' not in link:
            self.log("Found shelf link instead of product link {url}".format(
                url=link), INFO)
            continue
        item = SiteProductItem()
        if shelf_category:
            item['shelf_name'] = shelf_category
        if shelf_categories:
            item['shelf_path'] = shelf_categories
        yield urlparse.urljoin(response.url, link), item
def _scrape_product_links(self, response):
    """Yield a (link, empty item) pair for every product title found."""
    product_links = response.xpath(
        '//h4[@class="productTitle"]/a/@href').extract()
    if not product_links:
        self.log("Found no product links.", ERROR)
    for product_link in product_links:
        yield product_link, SiteProductItem()
def start_requests(self):
    """Emit search requests; in single-product mode resolve the numeric
    product id from the URL and hit the JSON item API.

    Fix: when the id regex does not match, ``is_empty`` yields a falsy
    result and the old string concatenation raised TypeError; that case
    is now logged and skipped.
    """
    for st in self.searchterms:
        yield Request(
            self.url_formatter.format(
                self.SEARCH_URL,
                search_term=urllib.quote_plus(st.encode('utf-8')),
            ),
            meta={'search_term': st, 'remaining': self.quantity},
        )
    if self.product_url:
        pId = is_empty(re.findall(r"product/.*/(\d+)", self.product_url))
        if not pId:
            self.log("Can't extract product id from url: %s"
                     % self.product_url, ERROR)
        else:
            url = "http://groceries.asda.com/api/items/view?" \
                  "itemid=" + pId + "&responsegroup=extended" \
                  "&cacheable=true&shipdate=currentDate" \
                  "&requestorigin=gi"
            prod = SiteProductItem()
            prod['is_single_result'] = True
            prod["url"] = self.product_url
            prod["reseller_id"] = pId
            yield Request(url, self._parse_single_product,
                          meta={'product': prod})
def start_requests(self):
    """Emit search requests carrying the new page-template cookie; in
    single-product mode request the product URL directly.

    Fix: the single-result item now records its own ``url``, matching
    every other spider in this file (it was previously left unset).
    """
    cookies = {'pageTemplate': 'new'}
    for st in self.searchterms:
        url = self.url_formatter.format(self.SEARCH_URL,
                                        search_term=urllib.quote_plus(
                                            st.encode('utf-8')),
                                        start=0,
                                        sort_mode=self.SORTING or '')
        yield Request(url,
                      meta={'search_term': st,
                            'remaining': self.quantity},
                      cookies=cookies)
    if self.product_url:
        prod = SiteProductItem()
        prod['is_single_result'] = True
        prod['url'] = self.product_url
        yield Request(self.product_url,
                      self._parse_single_product,
                      meta={'product': prod,
                            'handle_httpstatus_list': [404]},
                      cookies=cookies)
def _scrape_product_links(self, response):
    """Yield (absolute url, item) pairs, trying three link selectors in
    order of preference and attaching breadcrumb shelf data."""
    link_xpaths = (
        '//a[contains(@property, "url")]/@href',
        './/div[@class="product-info"]'
        '/a[contains(@class, "product-title")]/@href',
        '//a[@class="product-title scTrack pfm"]/@href',
    )
    urls = []
    for xpath in link_xpaths:
        urls = response.xpath(xpath).extract()
        if urls:
            break
    urls = [urlparse.urljoin(response.url, u) for u in urls]
    shelf_category = response.xpath('//h1/text()').extract()
    if shelf_category:
        shelf_category = shelf_category[0].strip(' \t\n')
    shelf_path = response.xpath(
        '//div[contains(@class, "stp--breadcrumbs")]/ul/li/a/text()'
        ' | //div[contains(@class, "stp--breadcrumbs")]/ul/li[@class="last"]/text()'
    ).extract()
    for url in urls:
        item = SiteProductItem()
        if shelf_category:
            item['shelf_name'] = shelf_category
        if shelf_path:
            item['shelf_path'] = shelf_path
        yield url, item