def test_url_query_parameter_2(self):
    """
    Regression test for nested, double-encoded affiliate URLs.

    Feeds sometimes carry an affiliate URL whose `url` parameter is itself
    an encoded affiliate URL containing a `referredURL` parameter. Extracting
    the outer URL works, but extracting `referredURL` from the result fails
    when the feed encodes an apostrophe as the `&#39;` entity: the decoded
    apostrophe breaks the second `url_query_parameter` call (the current
    spider-side workaround is replacing ' with %27 before extraction).
    """
    return  # FIXME: this test should pass but currently doesnt
    # correct case
    aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
    aff_url2 = url_query_parameter(aff_url1, 'url')
    self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN¶ms=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1")
    prod_url = url_query_parameter(aff_url2, 'referredURL')
    self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357199&langId=-1")
    # weird case
    aff_url1 = "http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1"
    aff_url2 = url_query_parameter(aff_url1, 'url')
    self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN¶ms=adref%3DGarden and DIY->Garden furniture->Children's garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1")
    prod_url = url_query_parameter(aff_url2, 'referredURL')
    # fails, prod_url is None now
    self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357023&langId=-1")
def parse(self, response):
    """Entry point: lazily load previous-crawl promo metadata, then queue
    category API requests for every header-nav link carrying id/cat params.

    Fix: the DataFrame was built with ``dtype=pd.np.str``; the ``pandas.np``
    alias was removed in pandas 1.0 and ``numpy.str`` in numpy 1.24. Plain
    ``str`` is the supported equivalent and behaves identically.
    """
    # Lazily build the metadata DataFrame from the previous crawl's
    # meta file, if a prev_crawl_id was supplied and the file exists.
    if self.meta_df is None and hasattr(self, 'prev_crawl_id'):
        meta_filename = os.path.join(
            DATA_DIR, 'meta/%s_meta.json-lines' % self.prev_crawl_id)
        if os.path.exists(meta_filename):
            with open(meta_filename) as f:
                self.meta_df = pd.DataFrame(
                    columns=['identifier', 'promo_start', 'promo_end'],
                    dtype=str)
                # One JSON object per line; row index i is temporary until
                # the identifier index is set below.
                for i, line in enumerate(f):
                    p = json.loads(line.strip())
                    self.meta_df.loc[i] = {
                        'identifier': p['identifier'],
                        'promo_start': p['metadata'].get('promo_start'),
                        'promo_end': p['metadata'].get('promo_end')
                    }
                # Index by identifier for fast lookups; keep the column too.
                self.meta_df.set_index('identifier', drop=False, inplace=True)
    elif not hasattr(self, 'prev_crawl_id'):
        self.log('prev_crawl_id attr not found')
    # Queue one API request per category link that has both id and cat.
    for url in response.xpath(
            '//*[@id="header"]/nav/div/ul/li/a/@href').extract():
        u_id = url_query_parameter(url, 'id')
        u_cat = url_query_parameter(url, 'cat')
        if u_id and u_cat:
            yield scrapy.Request(
                'http://www.phonehouse.pt/api.php/getProducts/' + u_id +
                '/' + u_cat + '/0',
                callback=self.parse_products,
                meta={'u_id': u_id, 'u_cat': u_cat, 'offset': 0})
def test_url_query_parameter(self):
    """Basic behaviour of url_query_parameter: hit, default fallback, blanks."""
    # A present parameter is returned as a string.
    result = url_query_parameter("product.html?id=200&foo=bar", "id")
    self.assertEqual(result, '200')
    # A missing parameter falls back to the supplied default.
    result = url_query_parameter(
        "product.html?id=200&foo=bar", "notthere", "mydefault")
    self.assertEqual(result, 'mydefault')
    # A blank value is dropped by default...
    result = url_query_parameter("product.html?id=", "id")
    self.assertEqual(result, None)
    # ...but kept when keep_blank_values is truthy.
    result = url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    self.assertEqual(result, '')
def parse_results(self, response):
    """Selenium-backed results page: paginate via page/per_page params, then
    re-fetch each product page (and its colour options) through the shared
    browser, yielding de-duplicated items from parse_product.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select('//div[contains(@class, "item") and contains(@class, "product")]')
    cookie_jar = response.meta['cookiejar']
    current_page = url_query_parameter(response.url, 'page')
    # The item-count element only appears when pagination info is available.
    pages_found = bool(hxs.select('//div[@class="item-count"]/strong/text()').extract())
    if current_page and pages_found:
        current_page = int(current_page)
        per_page = int(url_query_parameter(response.url, 'per_page'))
        # Last <strong> holds the grand total of items.
        total_items = int(hxs.select('//div[@class="item-count"]/strong/text()').extract()[-1])
        next_page = current_page + 1
        # NOTE(review): `/` reads as Python 2 integer division here (the file
        # uses HtmlXPathSelector/urljoin_rfc); under Python 3 this would be a
        # float and the ceil-adjustment below would misbehave -- confirm
        # interpreter version before porting.
        total_pages = total_items / per_page
        if (total_items % per_page) > 0:
            total_pages += 1
        if next_page <= total_pages:
            next_url = add_or_replace_parameter(response.url, 'page', str(next_page))
            yield Request(next_url, meta={'cookiejar': cookie_jar}, callback=self.parse_results)
    for product in products:
        product_url = product.select('.//div[@class="title"]//a/@href').extract()
        if not product_url:
            continue
        product_url = urljoin_rfc(base_url, product_url[0])
        # Skip URLs already visited in this crawl session.
        if product_url in self.viewed_urls:
            continue
        self.viewed_urls.append(product_url)
        # Re-fetch through the shared Selenium browser so JS-rendered content loads.
        self._browser.get(product_url)
        response = HtmlResponse(url=self._browser.driver.current_url,
                                body=self._browser.driver.page_source,
                                encoding='utf-8')
        for item in self.parse_product(response):
            # Only yield identifiers not seen before.
            if item['identifier'] not in self.new_ids:
                self.new_ids.append(item['identifier'])
                yield item
        # Each colour option is a separate page; crawl those too.
        options = product.select('.//*[@class="color-selector-items"]/a/@href').extract()
        for option_url in options:
            option_url = urljoin_rfc(base_url, option_url)
            if option_url in self.viewed_urls:
                continue
            self.viewed_urls.append(option_url)
            self._browser.get(option_url)
            response = HtmlResponse(url=self._browser.driver.current_url,
                                    body=self._browser.driver.page_source,
                                    encoding='utf-8')
            for item in self.parse_product(response):
                if item['identifier'] not in self.new_ids:
                    self.new_ids.append(item['identifier'])
                    yield item
        # Throttle the headless browser between products.
        time.sleep(5)
def parse(self, response):
    """Parse one product detail page and yield a loaded Product item.

    Bails out silently when the page has no sku-title (not a product page).
    Category/brand/sku come from the crawl meta set by the caller.
    """
    hxs = HtmlXPathSelector(response)
    titles = hxs.select('//*[@id="sku-title"]/h1/text()').extract()
    if not titles:
        return
    product_name = titles[0]
    images = hxs.select(
        '//*[@id="postcard-thumbnail"]//img[@itemprop="image"]/@src'
    ).extract()
    product_id = url_query_parameter(response.url, 'id')
    price_text = hxs.select(
        '//*[@id="priceblock-wrapper-wrapper"]//div[@class="item-price"]/text()'
    ).extract()[0]
    crawl_meta = response.meta
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', product_id)
    loader.add_value('name', product_name)
    loader.add_value('category', crawl_meta['product']['category'])
    loader.add_value('brand', crawl_meta['product']['brand'])
    loader.add_value('sku', crawl_meta['product']['sku'])
    loader.add_value('url', response.url)
    loader.add_value('price', price_text)
    if images:
        loader.add_value('image_url', images[0])
    yield loader.load_item()
def parse_products_list(self, response):
    """Listing page: yield a Product per card, skipping prescription items,
    then request the next infinite-scroll page until results repeat."""
    cards = response.xpath('//div[contains(@class, "card--product")]')
    prescription_markers = (
        'I Have a Private Prescription',
        'I Need a Private Prescription',
        'I Have an NHS Prescription',
    )
    for card in cards:
        label = ' '.join(card.xpath(
            './/div[@class="links_widget"]/p/a/span/text()').extract())
        # Prescription-only products are not priced publicly; skip them.
        if any(marker in label for marker in prescription_markers):
            continue
        loader = ProductLoader(item=Product(), selector=card)
        loader.add_value('name', card.xpath('.//h2/a/text()').extract()[0])
        loader.add_value('url', card.xpath('.//h2/a/@href').extract()[0])
        product_id = card.xpath('.//div/button/@data-product-id').extract()[0]
        loader.add_value('identifier', product_id)
        loader.add_value('sku', product_id)
        # Prefer the discounted price; fall back to the regular one.
        price_texts = card.xpath(
            './/span[@class="special-price"]/span[@class="price"]/text()').extract()
        if not price_texts:
            price_texts = card.xpath(
                './/span[@class="regular-price"]/span[@class="price"]/text()').extract()
        price = extract_price(price_texts[0])
        loader.add_value('price', price)
        crumbs = response.xpath(
            '//nav[@class="breadcrumb"]//li/span/text()').extract()
        loader.add_value('category', crumbs[-1] if crumbs else '')
        # Flat delivery fee below the free-shipping threshold.
        if price < 40:
            loader.add_value('shipping_cost', 3.19)
        images = card.xpath(
            './/img[contains(@id, "product-collection-image")]/@src').extract()
        loader.add_value('image_url',
                         response.urljoin(images[0]) if images else '')
        yield loader.load_item()
    # Stop paginating once a page repeats the previous page's URL list.
    seen_urls = cards.xpath('.//h2/a/@href').extract()
    if cards and seen_urls != response.meta.get('url_list', []):
        page_no = url_query_parameter(response.url, 'p', '1')
        next_url = add_or_replace_parameter(response.url, 'infinitescroll', '1')
        next_url = add_or_replace_parameter(next_url, 'p', str(int(page_no) + 1))
        yield Request(next_url, callback=self.parse_products_list,
                      meta={'url_list': seen_urls})
def parse(self, response):
    """Category page: build one LEGO Product per listing block, then follow
    the "Page suivante" pagination link."""
    hxs = HtmlXPathSelector(response)
    for block in hxs.select('//div[@class="bloc_article_float"]'):
        category = block.select(
            'table/tr/td//div[@class="marque trunc"]/@title').extract()[0]
        title = block.select(
            'table/tr/td//div[@class="nom trunc"]/div/span/a/text()'
        ).extract()[0].strip()
        # French decimal comma -> dot.
        price = block.select(
            'table/tr/td//div[@class="prix"]/text()'
        ).extract()[0].strip().replace(',', '.')
        url = block.select(
            'table/tr/td//div[@class="nom trunc"]/div/span/a/@href'
        ).extract()[0].strip()
        # Swap the medium thumbnail for the zoomed image.
        image = block.select(
            'table/tr/td[contains(@class, "photo")]//img/@src'
        ).extract()[0].replace('MED', 'ZOO')
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', url_query_parameter(url, 'id_article'))
        loader.add_value('name', category + ' ' + title)
        loader.add_value('category', category)
        loader.add_value('brand', "LEGO")
        # SKU is the set number before the first dash in the title.
        loader.add_value('sku', title.split('-')[0])
        loader.add_value('url', url)
        loader.add_value('price', price)
        loader.add_value('image_url', image)
        yield loader.load_item()
    next_page = hxs.select('//a[text()="Page suivante "]/@href').extract()
    if next_page:
        yield Request(next_page[0])
def parse(self, response):
    """JSON (demjson) item list: yield a Product per entry, then request the
    next page while the last item's position is before the reported total."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    data = demjson.decode(response.body)
    product = None
    for product in data['itemList']:
        product_loader = ProductLoader(item=Product(), selector=hxs)
        # Scheme-relative CDN image URL derived from the product id.
        image_url = '//d39rqydp4iuyht.cloudfront.net/store/product/image/{}.gif'.format(product['id'])
        product_identifier = product['id']
        product_name = product['name']
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        product_loader.add_value('image_url', image_url)
        price = product['minPrice']
        # SKU heuristic: the longest run of digits/commas/dots in the name.
        sku = ''
        for match in re.finditer(r"([\d,\.]+)", product_name):
            if len(match.group()) > len(sku):
                sku = match.group()
        product_loader.add_value('sku', sku)
        product_loader.add_value('price', price)
        url = '/store/ck/item/' + str(product['id'])
        product_loader.add_value('url', urljoin_rfc(base_url, url))
        yield product_loader.load_item()
    # `product` deliberately leaks from the loop: it is the last list entry.
    # If its position is before the reported total, fetch the next page.
    if product and product['dataPosition'] < data['numItems']:
        page = int(url_query_parameter(response.url, 'page')) + 1
        url = add_or_replace_parameter(response.url, 'page', str(page))
        yield Request(url)
def parse_products(self, response):
    """Product listing page: follow each product link and, when the page is
    full (>= 500 items), the next beginIndex page; retry up to 7 times when
    the page fails to parse.

    Fix: the original used a bare ``except:``, which also swallows
    SystemExit/KeyboardInterrupt; narrowed to ``except Exception``. The
    retry behaviour is otherwise unchanged.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    try:
        products = hxs.select(
            '//div[@class="product_name"]/a/@href').extract()
        for product in products:
            yield Request(urljoin_rfc(get_base_url(response), product),
                          callback=self.parse_product,
                          meta=response.meta)
        # The site serves 500 products per page; a full page implies more.
        if len(products) >= 500:
            index = int(url_query_parameter(response.url, 'beginIndex', 0))
            url = add_or_replace_parameter(response.url, 'beginIndex',
                                           str(index + 500))
            yield Request(url, callback=self.parse_products,
                          meta=response.meta)
    except Exception:
        # Page failed to parse (e.g. error/ban page): log it and retry.
        log.msg('PAGE ERROR >>>')
        log.msg(str(response.body))
        retry = response.meta.get('retry', 0) + 1
        if retry <= 7:
            log.msg('Retry: ' + response.url)
            time.sleep(5)  # NOTE: blocks the reactor; kept as in original
            yield Request(response.url, dont_filter=True,
                          callback=self.parse_products,
                          meta={'retry': retry})
def load_item_(self, item, browser=None, use_adurl=True):
    """Build a Product item from a scraped dict, optionally backed by live
    browser state (used for identifier/sku and the loader response)."""
    if browser:
        response = HtmlResponse(url=browser['webdriver'].current_url,
                                body=browser['webdriver'].page_source,
                                encoding='utf-8')
    else:
        # Synthetic response so the loader has something to bind to.
        response = HtmlResponse(url='http://www.google.co.uk/shopping',
                                body='<html></html>', encoding='utf-8')
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', self._try_encoding(item['name']))
    # Prefer the ad redirect target when present and allowed.
    raw_url = self._try_encoding(item['url'])
    adurl = url_query_parameter(raw_url, 'adurl')
    item_url = adurl if (adurl and use_adurl) else raw_url
    loader.add_value('url', item_url)
    loader.add_value('price', item['price'])
    loader.add_value('shipping_cost', item.get('shipping_cost', 0))
    loader.add_value('dealer', item.get('dealer', ''))
    # identifier/sku come from browser meta when browsing, else from the dict.
    if browser:
        loader.add_value('identifier', browser['meta']['identifier'])
        loader.add_value('sku', browser['meta']['sku'])
    else:
        loader.add_value('identifier', item['identifier'])
        loader.add_value('sku', item['sku'])
    return loader.load_item()
def parse(self, response):
    """Listing page: crawl category links, per-manufacturer brand filters
    (one level deep), product pages, and the Next pagination link."""
    category_links = response.xpath(
        '//div[contains(@class, "div-category")]//a/@href').extract()
    category_links += response.xpath(
        '//ul[contains(@class, "category-list")]//a/@href').extract()
    for link in category_links:
        yield Request(link)
    # Follow brand filters only once (extract_brands=False on the recursion).
    if response.meta.get('extract_brands', True):
        brand_links = response.xpath(
            '//dl[@id="narrow-by-list"]//a[contains(@href, "?manufacturer=")]/@href'
        ).extract()
        for link in brand_links:
            manufacturer_id = url_query_parameter(link, 'manufacturer', None)
            if not manufacturer_id:
                continue
            manufacturer_id = manufacturer_id.split(',')[0]
            # Only single-manufacturer filter URLs (id ends the href).
            if link.endswith(manufacturer_id):
                yield Request(link, meta={'extract_brands': False})
    for product_link in response.xpath(
            '//h3[contains(@class, "product-name")]/a/@href').extract():
        yield Request(product_link, callback=self.parse_product)
    next_links = response.xpath('//a[contains(text(), "Next")]/@href').extract()
    if next_links:
        yield Request(next_links[0])
def parse_brand(self, response):
    """Follow every product link on a brand listing page, tagging each
    request with the brand taken from the `Brand` query parameter."""
    brand_name = url_query_parameter(response.url, 'Brand', '')
    for href in response.xpath(
            '//section[@id="productList"]//a/@href').extract():
        yield Request(urljoin(get_base_url(response), href),
                      meta={'brand': brand_name},
                      callback=self.parse_product)
def parse_product(self, response):
    """Product page: take the lowest offer price (falling back to the itemprop
    price), retry up to 10 times when the page looks blocked/empty, and yield
    a de-duplicated Product when a price was found."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    price_decimal = None
    try:
        # Lowest price across the offers table; min() raises on empty input,
        # which is what triggers the fallback below.
        price_decimal = min(
            map(
                lambda p: Decimal(p),
                hxs.select(
                    '//section[@class="product-offers-group"]//tr/@data-offer-price'
                ).extract()))
    except:
        # Deliberate broad catch: any failure falls back to the single price.
        price = hxs.select(
            '//*[@itemprop="price"]/text()|//*[@itemprop="lowprice"]/text()'
        ).extract()
        price_decimal = extract_price_eu(price[0]) if price else None
    # Polish marker text: "currently no offers for this product".
    unavailable = 'Aktualnie brak ofert tego produktu. Zobacz inne produkty z kategorii' in response.body
    if (not price_decimal) and (not unavailable):
        # No price and not genuinely unavailable: probably a block page.
        blocked_url = url_query_parameter(response.url, 'returnUrl')
        if blocked_url:
            blocked_url = urljoin_rfc(base_url, blocked_url)
            self.log('ERROR: Blocked URL => %s' % blocked_url)
        else:
            self.log('ERROR: No product found in => %s' % response.url)
        retry_no = int(response.meta.get('retry_no', 0))
        if retry_no < 10:
            retry_no += 1
            self.log('DEBUG: Retrying page - Retry No: %s' % retry_no)
            yield Request(blocked_url or response.url,
                          meta={
                              'category': response.meta['category'],
                              'cookiejar': response.meta['cookiejar'],
                              'retry_no': retry_no
                          },
                          dont_filter=True,
                          callback=self.parse_product)
        return
    if price_decimal:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('identifier', '//input[@name="ProductID"]/@value')
        loader.add_xpath('sku', '//input[@name="ProductID"]/@value')
        loader.add_value('url', response.url)
        loader.add_value('price', price_decimal)
        loader.add_value('category', response.meta['category'].split(','))
        image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_url:
            # Strip the query string (size/cache params) from the image URL.
            loader.add_value(
                'image_url',
                urljoin_rfc(base_url, image_url[0]).split('?')[0])
        item = loader.load_item()
        # Only yield identifiers not seen in this crawl.
        if item['identifier'] not in self.new_ids:
            self.new_ids.append(item['identifier'])
            yield item
def parse_product(self, response):
    """Parse a single product page and yield one loaded item.

    The identifier comes from the `details<id>.html` URL pattern, or from
    the entryid+priceid query parameters; KeyError is raised if neither
    source is available.
    """
    names = response.xpath("//h2/span[@itemprop='name']/text()").extract()
    if not names:
        names = response.xpath("//table//tr/td//h2/text()").extract()
    product_name = names[0]
    prices = response.xpath("//span[@itemprop='price']/text()").re('[\d\.]+')
    if not prices:
        prices = response.xpath(
            "//span[@class='pr-price']/strong/text()").re('[\d\.]+')
    product_price = prices[0]
    # Out of stock only when availability is present and is not "InStock".
    availability = response.xpath("//*[@itemprop='availability']/@href").extract()
    if availability and 'InStock' not in availability[0]:
        stock = 0
    else:
        stock = None
    cats = response.xpath("//div[@class='grid_10']/h1/a/text()").extract()
    brand = cats[-1]
    image_url = response.xpath(
        "//img[@alt='{}']/@src".format(product_name)).extract()
    match = re.search("details(.*)\.html", response.url)
    if match:
        identifier = match.group(1)
    else:
        entryid = url_query_parameter(response.url, 'entryid')
        priceid = url_query_parameter(response.url, 'priceid')
        if not entryid or not priceid:
            raise KeyError("Not found entryid and priceid in url: {}".format(response.url))
        identifier = entryid + priceid
    loader = ProductLoaderWithNameStrip(Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('price', product_price)
    loader.add_value('stock', stock)
    loader.add_value('url', response.url)
    loader.add_value('brand', brand)
    loader.add_value('sku', identifier)
    loader.add_value('identifier', identifier)
    loader.add_value('image_url', image_url)
    loader.add_value('category', cats)
    yield loader.load_item()
def load_item_(self, item, browser, use_adurl=True):
    """Build a Product item from a scraped dict plus live browser state,
    peeling ad/google redirect wrappers off the URL one layer at a time."""
    response = HtmlResponse(url=browser['webdriver'].current_url,
                            body=browser['webdriver'].page_source,
                            encoding='utf-8')
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', self._try_encoding(item['name']))
    loader.add_value('brand', self._try_encoding(item.get('brand', '')))
    # Layer 1: ad redirect (adurl), honoured only when use_adurl is set.
    item_url = self._try_encoding(item['url'])
    adurl = url_query_parameter(item_url, 'adurl')
    if adurl and use_adurl:
        item_url = adurl
    # Layer 2: tracking destination parameters.
    dest_url = (url_query_parameter(item_url, 'ds_dest_url')
                or url_query_parameter(item_url, 'url'))
    if dest_url:
        item_url = dest_url
    # Layer 3: google /url redirect (q, then url).
    if ('%s/url' % self.GOOGLE_DOMAIN) in item_url:
        redirect_target = url_query_parameter(item_url, 'q')
        if not redirect_target:
            redirect_target = url_query_parameter(item_url, 'url')
        if redirect_target:
            item_url = redirect_target
    loader.add_value('url', item_url)
    loader.add_value('price', item['price'])
    loader.add_value('shipping_cost', item.get('shipping_cost', 0))
    loader.add_value('dealer', item.get('dealer', ''))
    loader.add_value('identifier', item['identifier'])
    loader.add_value('sku', item.get('sku'))
    # Browser meta values (if any) are folded into the item verbatim.
    if 'meta' in browser:
        for key, value in browser['meta'].items():
            loader.add_value(key, value)
    result = loader.load_item()
    if 'metadata' in item:
        result['metadata'] = item['metadata']
    return result
def parse(self, response):
    """JSON listing response: queue product pages, then the next result page.

    Fix: ``url_query_parameter(response.url, 'p')`` returns None when the
    request URL carries no ``p`` parameter, so ``int(page)`` raised
    TypeError on the first page. Defaulting to '1' keeps existing behaviour
    for pages that do carry ``p`` and makes the first page paginate to 2.
    """
    result = json.loads(response.body)
    page = url_query_parameter(response.url, 'p', '1')
    # The product list is embedded as an HTML fragment in the JSON payload.
    hxs = HtmlXPathSelector(text=result['html'])
    product_urls = hxs.select('//li/a/@href').extract()
    self.log('{} products found'.format(len(product_urls)))
    for url in product_urls:
        yield Request(url, callback=self.parse_product)
    if result['is_there_a_next_page']:
        yield Request(self.search_url.format(int(page) + 1))
def parse(self, response):
    """JSON product listing: follow each product link, then the next page.

    An empty products list means we've walked past the last page.
    """
    base_url = get_base_url(response)
    product_list = json.loads(response.body)['products']
    for entry in product_list:
        yield Request(urljoin(base_url, entry['link']),
                      callback=self.parse_product)
    if product_list:
        next_page = int(url_query_parameter(response.url, 'p', '0')) + 1
        yield Request(add_or_replace_parameter(response.url, 'p',
                                               str(next_page)))
def parse_product_list(self, response):
    """Pull product links out of an escaped-HTML body; paginate while the
    page is full (exactly 100 items)."""
    matches = list(re.finditer(r'(?si)<h5>.+?href=\\"(.*?)\\"', response.body))
    for match in matches:
        # Hrefs are backslash-escaped inside the JSON-ish body.
        yield Request(match.group(1).replace('\\', ''),
                      callback=self.parse_product)
    if len(matches) == 100:
        next_page = int(url_query_parameter(response.url, 'p', '1')) + 1
        yield Request(add_or_replace_parameter(response.url, 'p',
                                               str(next_page)),
                      callback=self.parse_product_list)
def parse_search(self, response):
    """Follow product links from a brand search page, then hit the AJAX
    search endpoint for results not rendered server-side."""
    brand_name = url_query_parameter(response.url, 'Brand', '')
    for href in response.xpath(
            '//section[@id="productList"]//a/@href').extract():
        yield Request(urljoin(get_base_url(response), href),
                      meta={'brand': brand_name},
                      callback=self.parse_product)
    yield Request('http://www.ezyvision.co.nz/ajax/search',
                  callback=self.parse_ajax_search)
def parse_product(self, response):
    """Product page: emit one item per option row when an options table
    exists, otherwise a single item from the page-level price."""
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    category = hxs.select('//div[@id="crumblinks"]//a/text()').extract()
    category = category[-1] if category else ''
    image_url = hxs.select('//img[@id="product-big"]/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
    product_brand = ''
    # Brand is recovered from the `search` parameter of the brand-image link.
    brand_url = hxs.select(
        '//div[@class="description"]//img[@alt="Brand Image"]/parent::a/@href'
    ).extract()
    if brand_url:
        brand_url = urljoin_rfc(base_url, brand_url[0])
        product_brand = url_query_parameter(brand_url, 'search')
    name = hxs.select("//h1[@class='coarse']/text()")[0].extract().strip()
    # Skip the header row of the options table.
    options = hxs.select('//div[@class="generated"]/table/tr')[1:]
    select = hxs.select(
        '//form[@id="cart_form"]//select[@class="prodoptions"]').extract()
    if options:
        # options
        for option in options:
            name2 = option.select('./td[position()=4]/text()')
            name2 = name2[0].extract().strip() if name2 else ''
            # Second-to-last text cell holds the price.
            price = option.select('.//td/text()').extract()[-2].strip()
            loader = ProductLoader(item=Product(), selector=option)
            loader.add_xpath('identifier', './td[position()=2]/text()')
            loader.add_xpath('sku', './td[position()=3]/text()')
            loader.add_value('url', response.url)
            loader.add_value(
                'name',
                name + ' %s %s' % (loader.get_output_value('identifier'), name2))
            loader.add_value('price', price)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', product_brand)
            yield loader.load_item()
    else:
        # NOTE: the raw string below contains a literal newline inside the
        # character class -- preserved exactly as found.
        price = "".join(
            hxs.select(".//span[@class='bigprice']/text()").re(
                r'([0-9\,\. 
]+)')).strip()
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('identifier', response.url)
        loader.add_value('image_url', image_url)
        loader.add_value('category', category)
        # NOTE(review): relative td xpath against the full-page response --
        # this likely matches nothing here; confirm intent.
        loader.add_xpath('sku', './td[position()=2]/text()')
        loader.add_value('brand', product_brand)
        yield loader.load_item()
def parse_categories(self, response):
    """Category listing: follow product pages, then advance to the next
    listing page as long as this one had products."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    links = hxs.select(
        '//p[contains(@class, "product-name")]//a/@href').extract()
    for link in links:
        yield Request(urljoin_rfc(base_url, link), callback=self.parse_product)
    if links:
        next_page = str(int(url_query_parameter(response.url, 'p', 0)) + 1)
        yield Request(add_or_replace_parameter(response.url, 'p', next_page),
                      callback=self.parse_categories)
def parse(self, response):
    """Product grid page: follow each product, then request the next grid
    page via the `_iv_page` parameter."""
    hxs = HtmlXPathSelector(response)
    product_links = hxs.select(
        '//div[contains(@data-plugins,"ProductGrid")]//div[contains(@class, "product")]/a/@href'
    ).extract()
    for link in product_links:
        yield Request(urljoin_rfc(get_base_url(response), link),
                      callback=self.parse_product)
    if not product_links:
        return
    # First page carries no `_iv_page` parameter; treat it as page 1.
    page_param = url_query_parameter(response.url, '_iv_page')
    page_no = int(page_param) if page_param else 1
    yield Request(add_or_replace_parameter(response.url, '_iv_page',
                                           str(page_no + 1)))
def parse_product_list(self, response):
    """Brand listing: apply the brand refinement filter once (when no `f`
    param is active), expand via the "all products" link, then follow
    product links and every pagination link."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    '''
    brand_cats_urls = hxs.select('//div[@class="left_menu"]/div[@class="category_container"]//a/@href').extract()
    for url in brand_cats_urls:
        yield Request(urljoin_rfc(base_url, url), meta=response.meta.copy(), callback=self.parse_product_list)
    '''
    # No active filter (`f` param) yet: click through the brand refinement
    # first and stop processing this unfiltered page.
    if not url_query_parameter(response.url, 'f'):
        filter_brand_url = hxs.select(
            u'//p[@class="arrow_head" and span[@class="leftsubcat_categories" '
            u'and contains(text(), "Brand")]]/following-sibling::ul[contains(@class, "brand_list")]'
            u'//span[@id="refine_label" and contains(text(), "%s")]/parent::a/@href'
            % response.meta['brand']).extract()
        if filter_brand_url:
            url = filter_brand_url[0]
            yield Request(urljoin_rfc(base_url, url),
                          meta=response.meta.copy(),
                          callback=self.parse_product_list)
            return
    # "Up to category" / all-products link widens the listing.
    all_products_link = hxs.select(
        '//div[@class="left_nav brand_cat"]//a[p[@class="upto_cat"]]/@href'
    ).extract()
    if all_products_link:
        url = all_products_link[0]
        yield Request(urljoin_rfc(base_url, url),
                      meta=response.meta.copy(),
                      callback=self.parse_product_list)
    products = hxs.select(
        '//div[@id="grid-view"]/div[@class="grid_view_row"]'
        '/div[contains(@class, "products_details_container")]'
        '/div[contains(@class, "products_details")]'
        '//li[contains(@class, "description")]/a/@href').extract()
    for url in products:
        url = urljoin_rfc(base_url, url)
        yield Request(url, meta=response.meta.copy(),
                      callback=self.parse_product)
    # Follow every non-active pager link (dedup left to the scheduler).
    pages = hxs.select(
        '//div[@class="pagination"][1]/a[not(@class="active")]/@href'
    ).extract()
    for next_page in pages:
        url = urljoin_rfc(base_url, next_page)
        yield Request(url, meta=response.meta.copy(),
                      callback=self.parse_product_list)
def parse(self, response):
    """Listing page: queue product requests; follow the pager's next link
    by bumping the `cp` query parameter."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    for href in hxs.select(
            '//div[@class="list-item-info"]//h4/a/@href').extract():
        yield Request(urljoin_rfc(base_url, href), callback=self.parse_product)
    pager_next = hxs.select('//li[@class="pager-next"]/a/@href').extract()
    if pager_next:
        next_cp = int(url_query_parameter(response.url, 'cp', '0')) + 1
        next_page = add_or_replace_parameter(response.url, 'cp', str(next_cp))
        yield Request(urljoin_rfc(base_url, next_page))
def parse_item(self, response):
    """Extract WHOIS-style registration details for the domain named in the
    `dom` query parameter; returns None when the parameter is absent."""
    domain = url_query_parameter(response.url, "dom")
    if not domain:
        return
    hxs = HtmlXPathSelector(response)
    # The details live in a deeply nested layout table.
    details_table = hxs.select('/html/body/table/tr[2]/td/table[3]/tr/td/table')
    loader = MarnetLoader(item=MarnetItem(), selector=details_table)
    loader.add_value('domain', domain)
    loader.add_xpath('dosie', './/tr/td/div/b/i/text()', re=':(\\d+)')
    loader.add_xpath('datum', './/tr[3]/td[2]/text()')
    loader.add_xpath('ime', './/tr[4]/td[2]/text()')
    loader.add_xpath('administrative', './/tr[11]/td[2]/text()')
    loader.add_xpath('techical', './/tr[15]/td[2]/text()')
    loader.add_xpath('dns', './/tr[@align="center"]/td/text()')
    return loader.load_item()
def parse(self, response):
    """JSON product feed: queue each product page; keep requesting the next
    `page` while the feed returns products."""
    base_url = get_base_url(response)
    payload = json.loads(response.body)
    if not payload:
        return
    products = payload['Products']
    for entry in products:
        yield Request(urljoin_rfc(base_url, entry['ProductUrl']),
                      callback=self.parse_product)
    if products:
        current = url_query_parameter(response.url, 'page', '1')
        yield Request(add_or_replace_parameter(response.url, 'page',
                                               str(int(current) + 1)))
def parse_product_list(self, response):
    """Listing page: follow products (carrying any promo discount in the
    URL), then POST one Mercado search request that fetches *all* pages of
    the category in a single response (Nrpp=9999)."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select(
        '//*[@id="wrapper_page_content"]//ul[@class="product"]')
    for product in products:
        url = product.select('./li[1]/a/@href').extract()
        if url:
            url = url[0]
            # Promo badge percentage is propagated via a query parameter so
            # parse_product can price the discount.
            discount = product.select(
                './li[contains(@class,"product_promo")]/img/@alt').re(
                    r'(\d+)')
            if discount:
                url = add_or_replace_parameter(url, 'qbDiscount', discount[0])
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)
    # Only do the all-pages POST once per category (all_pages_done guard).
    if not response.meta.get('all_pages_done', False):
        urls = hxs.select('//div[@class="pages"]//a/@href').extract()
        if urls:
            url = "http://www.bhs.co.uk/webapp/wcs/stores/servlet/CatalogNavigationSearchResultCmd"
            catId = response.xpath(
                '//form[@id="form_mercado_filters"]/input[@name="categoryId"]/@value'
            ).extract()[0]
            parent_categoryId = response.xpath(
                '//form[@id="form_mercado_filters"]/input[@name="parent_categoryId"]/@value'
            ).extract()[0]
            # The currently-selected sort option carries the Mercado `N`
            # navigation state; spaces must be '+'-encoded for dimSelected.
            n_field = response.xpath(
                '//select[@id="sel_sort_field"]/option[@selected="selected"]/@value'
            ).extract()[0]
            n_field = url_query_parameter(n_field, 'N').replace(' ', '+')
            # Nrpp=9999 asks for every result in one page.
            dimSelected = "?N=" + n_field + "&Ndr=100000&Nr=OR%28emailBackInStock%3AY%2CNOT%28product.inventory%3A0%29%29&siteId=%2F13077&sort_field=Relevance&No=0&Nrpp=9999&catId=" + catId + "&parent_categoryId=" + parent_categoryId
            formdata = {}
            formdata['langId'] = '-1'
            formdata['storeId'] = response.xpath(
                '//input[@name="storeId"]/@value').extract()[0]
            formdata['isHash'] = 'false'
            formdata['dimSelected'] = dimSelected
            formdata['catalogId'] = response.xpath(
                '//input[@name="catalogId"]/@value').extract()[0]
            yield FormRequest(url,
                              dont_filter=True,
                              formdata=formdata,
                              callback=self.parse_product_list,
                              meta={'all_pages_done': True})
def parse(self, response):
    """Category page: crawl football-shirt categories, POST into each
    product page (to set AUD currency/destination), and paginate either by
    page number or via the site's request_to all-products URL."""
    # NOTE(review): this local `formdata` is built but never used -- the
    # FormRequest below posts `self.formdata` instead. Unclear whether this
    # should initialise/replace self.formdata; confirm before removing.
    formdata = {
        'currency': 'AUD',
        'delivery_destination': '13',
        'update_currency_destination': 'Update'
    }
    base_url = "http://www.uksoccershop.com"
    categories = response.xpath(
        '//li[contains(a/span/text(), "Football Shirts")]//a/@href'
    ).extract()
    categories += response.xpath(
        '//div[h4/span/a/text()="Euro 2016 National Teams"]//div[contains(@class, "newitem")]/a/@href'
    ).extract()
    for category in categories:
        yield Request(response.urljoin(category))
    products = response.xpath(
        '//div[contains(@class, "productList")]//div[@class="productListLink"]/a/@href'
    ).extract()
    for product in products:
        # POST so the currency/destination form state applies to the page.
        yield FormRequest(urljoin_rfc(base_url, product),
                          formdata=self.formdata,
                          method='POST',
                          callback=self.parse_product)
    if products:
        next_url = "http://www.uksoccershop.com/index.html?cPath=%s&page=%s&ppp=48"
        # Category id from inline JS, falling back to the cPath form input.
        cat_id = re.findall('current_category_id = (\d+)', response.body)
        if not cat_id:
            cat_id = response.xpath(
                '//input[@name="cPath"]/@value').extract()
            cat_id = cat_id[0].split('_') if cat_id else None
        if cat_id:
            cat_id = cat_id[-1]
            current_page = url_query_parameter(response.url, 'page', '1')
            next_page = int(current_page) + 1
            yield Request(next_url % (cat_id, next_page))
        else:
            # No category id: fall back to the JS-built "request_to" URL
            # that loads the whole product list (9999 per page).
            request_to_urls = re.findall("var request_to = '(.*)'\+ 48",
                                         response.body)
            all_products = filter(
                lambda x: x if 'main_page' in x else None, request_to_urls)
            if all_products:
                yield Request(response.urljoin(all_products[0]) + '9999')
def parse_search(self, response):
    """Parse an ispSearchResult JSONP payload: follow brand-matching items,
    then request the next page while results keep coming."""
    brand = response.meta.get('brand', '')
    payload = re.search("ispSearchResult\((.*)\);", response.body)
    if not payload:
        return
    items = json.loads(payload.group(1))['items']
    for entry in items:
        # Only items whose label contains the requested brand.
        if brand.upper() in entry.get('l', '').upper().strip():
            yield Request(entry['u'], callback=self.parse_product,
                          meta=response.meta)
    if items:
        page_no = int(url_query_parameter(response.url, 'p', 0))
        next_url = add_or_replace_parameter(response.url, 'p',
                                            str(page_no + 1))
        yield Request(next_url, callback=self.parse_search,
                      meta=response.meta)
def parse_products(self, response):
    """Product listing: paginate up to the last page number, then follow
    each product link.

    Fix: the original condition ``cur_page <= last_page`` still queued a
    request for ``last_page + 1`` when already on the final page; the
    strict ``<`` stops pagination at the last real page.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    # The last page number is encoded in the element id, e.g. id="p12" -> 12.
    last_page = int(
        hxs.select('//div[@class="lastpagenumber"]/@id').extract()[0][1:])
    cur_page = int(url_query_parameter(response.url, 'p'))
    if cur_page < last_page:
        next_page_url = add_or_replace_parameter(response.url, 'p',
                                                 str(cur_page + 1))
        yield Request(next_page_url, callback=self.parse_products)
    for url in hxs.select(
            '//div[@class="products-set"]/ul/li/h4/a/@href').extract():
        yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)
def parse(self, response):
    """Episode list page: follow each episode's video page, then simulate
    pagination by requesting the site's XHR episode-list endpoint."""
    tree = lxml.html.fromstring(response.body_as_unicode())
    episodes = episode_sel(tree)
    for episode in episodes:
        anchor = link_sel(episode)[0]
        target = urljoin_rfc(self.start_urls[0], anchor.attrib['href'])
        yield Request(url=target, callback=self.parse_video_page)
    # Simulate pagination
    if episodes:
        page = url_query_parameter(response.url, 'page')
        if not page:
            page = '2'  # XHR request starts at page 2
        next_url = "http://blip.tv/pr/show_get_full_episode_list?"
        next_url += "users_id=348873&lite=1&esi=1&page=%s"
        next_url = next_url % str(int(page) + 1)
        yield Request(url=next_url, callback=self.parse)
def parse_minutes(self, response):
    """Persist a meeting-minutes response body to disk.

    The output path template is filled from ``locals()``, so the local
    names below (``filename``, ``daesu``, ``date``) are load-bearing --
    presumably `minute_filepath_fmt` references them; do not rename.
    """
    filename = url_query_parameter(response.url, "hfile")
    daesu = url_query_parameter(response.url, "daesu")
    date = self.parse_date(response)
    save_file(minute_filepath_fmt.format(DATA_DIR=DATA_DIR, **locals()),
              response.body)