def parse_options(self, response):
    """Yield the product shown on a page loaded for one selected option.

    The description is the page's ``h1`` text followed by
    ``response.meta['name2']``.  When ``response.meta`` carries
    ``search_q`` the product is only emitted if a
    ``prod-detail-part-value`` span contains that text, and
    ``response.meta['sku']`` is attached to the result.

    NOTE(review): the source was collapsed onto one line; the ``else`` is
    assumed to pair with the ``'search_q' in response.meta`` check (so a
    failed search emits nothing) -- confirm against the original file.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    searching = 'search_q' in response.meta
    if searching:
        matches = hxs.select(
            "//span[@class='prod-detail-part-value']/text()[contains(.,'"
            + response.meta['search_q'] + "')]").extract()
        #log.msg('SEARCH_Q: '+ response.meta['search_q'])
        if not matches:
            # The searched part number is not on this page.
            return

    # Both the search and the non-search paths build the same product; only
    # the search path adds the sku.
    name = hxs.select(
        "//div[@id='product-detail-div']/h1/text()")[0].extract().strip()
    price = "".join(
        hxs.select('//span[@class="prod-detail-cost-value"]/text()')
        .re(r'([0-9\,\. ]+)')).strip()
    res = {
        'url': response.url,
        'description': name + ' ' + response.meta['name2'],
        'price': price,
    }
    if searching:
        res['sku'] = response.meta['sku']
    yield load_product(res, response)
def parse_options(self, response):
    """Emit the product for a page requested with one option selected.

    ``response.meta['name2']`` is appended to the page title to form the
    description.  If ``response.meta`` has ``search_q``, the page must
    contain that text in a ``prod-detail-part-value`` span, otherwise
    nothing is yielded; on a match ``response.meta['sku']`` is copied into
    the product.

    NOTE(review): nesting was reconstructed from a single-line source; the
    ``else`` branch is taken to belong to the ``search_q`` test -- confirm.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    meta = response.meta

    has_query = 'search_q' in meta
    if has_query:
        part_xpath = (
            "//span[@class='prod-detail-part-value']/text()[contains(.,'"
            + meta['search_q'] + "')]")
        #log.msg('SEARCH_Q: '+ response.meta['search_q'])
        if not hxs.select(part_xpath).extract():
            return

    # Shared by both paths (previously written out twice).
    title = hxs.select("//div[@id='product-detail-div']/h1/text()")[0]
    name = title.extract().strip()
    price_parts = hxs.select(
        '//span[@class="prod-detail-cost-value"]/text()').re(r'([0-9\,\. ]+)')

    res = {}
    res['url'] = response.url
    res['description'] = name + ' ' + meta['name2']
    res['price'] = "".join(price_parts).strip()
    if has_query:
        res['sku'] = meta['sku']
    yield load_product(res, response)
def parse_product(self, response):
    """Yield products from every ``product_small_rod`` form on the page.

    A form with a variant ``select`` produces one product per option whose
    text has the ``name (...) (£price)`` shape; a form without options
    produces a single product priced from its ``bigprice`` span.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for form in hxs.select("//div[@class='product_small_rod']/form"):
        options = form.select(
            ".//div[@class='rod_inset']/p[@class='vat']/select/option/text()"
        ).extract()
        name = form.select(
            ".//div[@class='rod_inset']/a/h3[@class='coarse']/text()"
        )[0].extract().strip()
        href = form.select(".//div[@class='rod_inset']/a/@href")[0].extract()
        url = urljoin_rfc(base_url, href)

        if not options:
            price = "".join(
                form.select(".//span[@class='bigprice']/text()")
                .re(r'([0-9\,\. ]+)')).strip()
            yield load_product(
                {'url': url, 'description': name, 'price': price}, response)
            continue

        for option in options:
            label = re.match(r'(.*) \(.*\) \(', option)
            cost = re.match(r'.*\(\xa3(.*)\)', option)
            if label is None or cost is None:
                # Option text without the expected shape (for instance an
                # entry with no price); previously this raised
                # AttributeError and aborted the whole page.
                continue
            # A fresh dict per product so yielded items never share state.
            res = {
                'url': url,
                'description': name + u' ' + label.group(1),
                'price': cost.group(1),
            }
            yield load_product(res, response)
def parse_product(self, response):
    """Yield listed products whose cell text contains the search query.

    Each ``PListCell`` is kept only if one of its direct text nodes contains
    ``response.meta['search_q']`` and it has a ``ProductTitle`` link.  When
    ``response.meta`` has ``sku`` it is copied into the product.
    """
    if not isinstance(response, HtmlResponse):
        return
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for cell in hxs.select('//td[@class="PListCell"]'):
        found = cell.select(
            "./text()[contains(.,'" + response.meta['search_q'] + "')]"
        ).extract()
        if not found:
            continue
        title = cell.select("./a[@class='ProductTitle']/text()").extract()
        if not title:
            continue

        price = "".join(
            cell.select('./font[@class="ProductPrice"]/span/text()')
            .re(r'([0-9\,\. ]+)')).strip()
        href = cell.select("./a[@class='ProductTitle']/@href").extract()

        res = {
            'url': urljoin_rfc(base_url, href[0]),
            'description': title[0].strip(),
            'price': price,
        }
        # The original read ``responce.meta`` here, a NameError on every
        # matching product.
        if 'sku' in response.meta:
            res['sku'] = response.meta['sku']
        yield load_product(res, response)
def parse(self, response):
    """Follow menu, sub-category and pagination links, then yield products.

    Links are requested with this method as their callback.  Products are
    read from ``PListCell`` cells that contain a ``ProductTitle`` link.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    res = {}

    # Followed in this order: menu categories, sub-categories, pages.
    link_queries = (
        '//div[@id="my_menu"]//a/@href',
        '//td[@class="SubcatTitle"]/a/@href',
        '//td[@class="NavigationCell"]/a/@href',
    )
    for query in link_queries:
        for href in hxs.select(query).extract():
            yield Request(urljoin_rfc(base_url, href), callback=self.parse)

    for cell in hxs.select('//td[@class="PListCell"]'):
        title = cell.select("./a[@class='ProductTitle']/text()").extract()
        if not title:
            continue
        price_parts = cell.select(
            './font[@class="ProductPrice"]/span/text()').re(r'([0-9\,\. ]+)')
        link = cell.select("./a[@class='ProductTitle']/@href").extract()
        res['url'] = urljoin_rfc(base_url, link[0])
        res['description'] = title[0].strip()
        res['price'] = "".join(price_parts).strip()
        yield load_product(res, response)
def parse_product(self, response):
    """Yield a product, or one form request per variation option.

    Without a ``ddlVariationGroup`` select the page itself is the product.
    With exactly one such select, a form request is issued per option with
    ``parse_options`` as callback; ``sku`` and ``search_q`` are forwarded
    from ``response.meta`` when present.  Pages with two or more variation
    selects yield nothing.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    options = hxs.select("//select[contains(@name,'ddlVariationGroup')]/option")

    if not options:
        name = hxs.select(
            "//div[@id='product-detail-div']/h1/text()")[0].extract().strip()
        price = "".join(
            hxs.select('//span[@class="prod-detail-cost-value"]/text()')
            .re(r'([0-9\,\. ]+)')).strip()
        res = {'url': response.url, 'description': name, 'price': price}
        if 'sku' in response.meta:
            res['sku'] = response.meta['sku']
        yield load_product(res, response)
        return

    selects = hxs.select("//select[contains(@name,'ddlVariationGroup')]")
    if len(selects) >= 2:
        # Multi-select variation pages are not handled.
        return

    # The select's name does not change between options, so look it up once
    # instead of on every iteration.
    select_name = hxs.select(
        "//select[contains(@name,'ddlVariationGroup')]/@name").extract()[0]
    for option in options:
        request = FormRequest.from_response(
            response,
            formdata={select_name: option.select('./@value').extract()},
            dont_click=True,
            callback=self.parse_options)
        request.meta['name2'] = option.select('./text()').extract()[0].strip()
        for key in ('sku', 'search_q'):
            if key in response.meta:
                request.meta[key] = response.meta[key]
        yield request
def parse_product(self, response):
    """Request linked sub-products, then yield this page's product.

    Links inside ``ProductDescriptionContainer`` are requested with this
    method as callback; a ``ValueError`` while building a request is
    ignored.  The page's own product is skipped when its title or price is
    missing; a missing sku becomes an empty string.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)

    # sub products
    links = hxs.select(
        '//div[@class="ProductDescriptionContainer"]//a/@href').extract()
    for link in links:
        try:
            yield Request(link, callback=self.parse_product)
        except ValueError:
            pass

    product = {}
    try:
        product['url'] = response.url
        product['description'] = hxs.select('//h1/text()').extract()[0]
        product['price'] = hxs.select(
            '//em[@class="ProductPrice VariationProductPrice"]/text()'
        ).extract()[0]
        sku_texts = hxs.select('//div[@id="sku"]/text()').extract()
        product['sku'] = sku_texts[0] if sku_texts else ''
        yield load_product(product, response)
    except IndexError:
        # Title or price not found: nothing to emit for this page.
        return
def parse(self, response):
    """Crawl category, sub-category and page links and yield listed products.

    Every discovered link is requested with ``self.parse`` as callback.
    A ``PListCell`` without a ``ProductTitle`` link is skipped.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    res = {}

    # categories, then sub-categories, then pagination
    for link_xpath in ('//div[@id="my_menu"]//a/@href',
                       '//td[@class="SubcatTitle"]/a/@href',
                       '//td[@class="NavigationCell"]/a/@href'):
        targets = hxs.select(link_xpath).extract()
        for target in targets:
            yield Request(urljoin_rfc(base_url, target), callback=self.parse)

    cells = hxs.select('//td[@class="PListCell"]')
    for cell in cells:
        names = cell.select("./a[@class='ProductTitle']/text()").extract()
        if names:
            amount = "".join(
                cell.select('./font[@class="ProductPrice"]/span/text()')
                .re(r'([0-9\,\. ]+)')
            ).strip()
            hrefs = cell.select("./a[@class='ProductTitle']/@href").extract()
            res['url'] = urljoin_rfc(base_url, hrefs[0])
            res['description'] = names[0].strip()
            res['price'] = amount
            yield load_product(res, response)
def parse_product(self, response):
    """Follow ``product_info.php`` links and yield this page's product.

    Links containing ``language=`` are not followed.  Pages whose heading is
    one of two known non-product titles yield no product.  The price comes
    from ``productSpecialPrice`` when present, otherwise from the page
    heading.  Raises ``IndexError`` if the heading or the price is missing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)

    # sub products
    links = hxs.select(
        '//a[contains(@href, "product_info.php")]/@href').extract()
    for link in links:
        if 'language=' not in link:
            yield Request(link, callback=self.parse_product)

    description = hxs.select(
        '//td[@class="pageHeading" and not(@align="right")]/text()'
    ).extract()[0]
    if description in ('Welcome, Please Sign In',
                       "Let's See What We Have Here"):
        return

    # Raw strings: '\$' in a plain literal is an invalid escape sequence.
    special = hxs.select(
        '//span[@class="productSpecialPrice"]/text()').re(r'\$(.*)')
    if special:
        price = special[0]
    else:
        price = hxs.select(
            '//td[@class="pageHeading"]/text()').re(r'\$(.*)')[0]

    product = {
        'url': response.url,
        'description': description,
        'price': price,
        'sku': '',
    }
    yield load_product(product, response)
def parse_product(self, response):
    """Yield products, or follow sub-links, on an ``all-products.html`` page.

    Rows with an ``add-to-basket`` image are emitted as products, with the
    identifier taken from the ``iposAddToBasket`` onclick handler; other
    rows have their heading link requested instead.  Responses whose URL
    does not end in ``all-products.html`` yield nothing.

    NOTE(review): nesting was reconstructed from a single-line source; the
    ``else`` is assumed to pair with the ``add-to-basket`` test -- confirm.
    """
    base_url = get_base_url(response)
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    if not re.search(r'all-products\.html$', str(response.url)):
        return

    hxs = HtmlXPathSelector(response)
    rows = hxs.select(
        '//div[@id="double-column-wrapper"]/div[@id="col-2"]/div[@class="col-2-text-div"]/div[@class="col-2-divider"]'
    )
    for row in rows:
        if not row.select(".//img[@class='add-to-basket']").extract():
            suburl = row.select(
                './/h2[@class="subcategory-h2"]/a/@href')[0].extract()
            yield Request(urljoin_rfc(base_url, suburl))
            continue

        name = row.select('.//h2[@class="subcategory-h2"]/a/text()')[0].extract()
        if not name:
            continue
        href = row.select('.//h2[@class="subcategory-h2"]/a/@href')[0].extract()
        price = row.select(
            './/div[@class="product-list-right"]/div[@class="product-list-price-etc"]/div/p[@class="the-price"]/text()'
        ).re(r'([0-9\,\. ]+)')[0].strip()
        identifier = row.select(
            './/a[contains(@onclick, "iposAddToBasket")]/@onclick'
        ).re(r'iposAddToBasket\(\'(\d+)\',')[0]

        res = {
            'url': urljoin_rfc(base_url, href),
            'description': name,
            'price': price,
            'identifier': identifier,
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Request linked product pages and yield the product on this page.

    ``product_info.php`` links are followed unless they contain
    ``language=``.  No product is yielded for the two known non-product
    headings.  A ``productSpecialPrice`` value wins over the heading price.
    Raises ``IndexError`` if the heading or the price cannot be found.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)

    # sub products
    for href in hxs.select(
            '//a[contains(@href, "product_info.php")]/@href').extract():
        if 'language=' not in href:
            yield Request(href, callback=self.parse_product)

    product = {}
    product['url'] = response.url
    product['description'] = hxs.select(
        '//td[@class="pageHeading" and not(@align="right")]/text()'
    ).extract()[0]
    non_product_titles = ['Welcome, Please Sign In',
                          "Let's See What We Have Here"]
    if product['description'] in non_product_titles:
        return

    # The regexes are raw strings: '\$' in a plain literal is an invalid
    # escape sequence.
    dollar_amount = r'\$(.*)'
    special_price = hxs.select(
        '//span[@class="productSpecialPrice"]/text()').re(dollar_amount)
    if special_price:
        product['price'] = special_price[0]
    else:
        product['price'] = hxs.select(
            '//td[@class="pageHeading"]/text()').re(dollar_amount)[0]
    product['sku'] = ''
    yield load_product(product, response)
def parse_product(self, response):
    """Yield the page's product and log its identifiers to the CSV writer.

    The price is read from the ``price-block`` span, falling back to the
    ``special-price`` paragraph.  One row
    ``[sku, sku2, sku3, model, name]`` is written to ``self.csv_writer``;
    missing ``sku2``/``sku3`` are written as ``0`` and a missing model as
    an empty string.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    def first_or(values, fallback):
        # First extracted value, or the fallback when nothing matched.
        return values[0] if values else fallback

    titles = hxs.select("//div[@class='product-name']/h1/text()").extract()

    price = "".join(
        hxs.select(
            "//div[@class='col-right']/div/div[@class='price-block']/span/span[@class='price']/text()"
        ).re(r'([0-9\,\. ]+)')).strip()
    if not price:
        price = "".join(
            hxs.select(
                "//div[@class='col-right']/div/p[@class='special-price']/span[@class='price']/text()"
            ).re(r'([0-9\,\. ]+)')).strip()

    sku = hxs.select("//dd[@class='identifier']/text()")[0].extract()

    res = {
        'url': urljoin_rfc(base_url, response.url),
        'description': sku + ' ' + titles[0].strip(),
        'price': price,
        'sku': sku,
    }

    sku2_ = first_or(hxs.select("//div[@class='1']/text()").extract(), 0)
    sku3_ = first_or(hxs.select("//div[@class='2']/text()").extract(), 0)
    model_ = first_or(hxs.select("//dd[@class='model']/text()").extract(), '')

    self.csv_writer.writerow(
        [res['sku'], sku2_, sku3_, model_, titles[0].strip()])
    yield load_product(res, response)
def parse_product(self, response):
    """Yield the product described on a detail page, if it has a price.

    Nothing is yielded when the ``details`` heading is missing or no price
    digits are found.  Image URL and category (last breadcrumb link) are
    added when present; a missing stock number gives an empty sku.  Raises
    ``IndexError`` if the ``AddToCart`` identifier cannot be extracted.
    """
    if not isinstance(response, HtmlResponse):
        return
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    name = hxs.select("//div[@class='details']/h3/text()").extract()
    if not name:
        return

    res = {}
    price = "".join(
        hxs.select('//li[@class="priceBigD"]/text()')
        .re(r'([0-9\,\. ]+)')).strip()

    try:
        sku = hxs.select(
            '//ul[@id="pricing"]/li[@class="priceStockNo"]/text()'
        ).re(r'No.:(.*)$')[0].strip()
    except IndexError:
        # Was a bare ``except:``; only "no match" is an expected failure.
        sku = ''

    image = hxs.select('//div[@class="image"]//img/@src')
    if image:
        res['image_url'] = urljoin_rfc(base_url, image[0].extract())

    category = hxs.select('//div[@class="breadcrumbs"]/a/text()')
    if category:
        res['category'] = category[-1].extract()

    res['identifier'] = hxs.select(
        '//*[@id="addtocart"]/img/@onclick').re(r'AddToCart\(\d+,(\d+)')[0]

    if price:
        res['url'] = response.url
        res['description'] = name[0].strip()
        res['price'] = price
        res['sku'] = sku
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one product per ``priceMatch`` form container on the page.

    The price is the ``OFFER`` amount when present, otherwise the
    ``OUR PRICE`` amount, otherwise ``'0'``; ``Decimal(3)`` is then added.
    NOTE(review): the added 3 is presumably a fixed surcharge such as
    delivery -- confirm.

    A row without a link is skipped.
    """
    hxs = HtmlXPathSelector(response)
    for row in hxs.select('//form[@name="priceMatch"]/../..'):
        try:
            name = row.select('.//a/text()').extract()[0]
            href = row.select('.//a/@href').extract()[0]
        except IndexError:
            # The original returned here, dropping every remaining row on
            # the page because of a single malformed one.
            continue

        url = urljoin_rfc(self.URL_BASE, href)
        price = row.re('OFFER \xa3(.*)</b')
        if not price:
            # if there isn't a special offer
            price = row.re('OUR PRICE \xa3(.*) <')
        price = price[0] if price else '0'

        res = {
            'url': url,
            'description': name,
            'price': str(Decimal(price) + Decimal(3)),
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Yield the product on a ``productInfo`` page.

    The price comes from the ``price last`` span, falling back to the
    ``price`` span.  If the name, price or product code cannot be found the
    page yields nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    try:
        title = hxs.select(
            '//div[@class="productInfo"]/h1/text()').extract()[0]

        amounts = hxs.select(
            '//span[@class="price last"]/text()').re('\xa3(.*)')
        if not amounts:
            amounts = hxs.select(
                '//span[@class="price"]/text()').re('\xa3(.*)')
        amount = amounts[0]

        #sku = hxs.select('//div[@class="productInfo"]/p/text()').re(': (.*)')[0]
        code = hxs.select('//div[@class="productInfo"]/p/text()').re(
            'Product code:\r\n(.*)')[0].strip()

        res = {
            'url': response.url,
            'description': title,
            'price': amount,
            'sku': code,
        }
        yield load_product(res, response)
    except IndexError:
        # A required field was not found on the page.
        return
def parse_product(self, response):
    """Emit a product for each container of a ``priceMatch`` form.

    Price preference: ``OFFER`` amount, then ``OUR PRICE`` amount, then
    ``'0'``.  ``Decimal(3)`` is added to the chosen amount.
    NOTE(review): the meaning of the added 3 is not visible here
    (presumably a flat charge) -- confirm.

    Containers without a link are skipped.
    """
    hxs = HtmlXPathSelector(response)
    containers = hxs.select('//form[@name="priceMatch"]/../..')
    for container in containers:
        names = container.select('.//a/text()').extract()
        hrefs = container.select('.//a/@href').extract()
        if not names or not hrefs:
            # Previously an IndexError here ended the whole loop via
            # ``return``; one bad row should not hide the rest.
            continue

        amounts = container.re('OFFER \xa3(.*)</b')
        if not amounts:
            # if there isn't a special offer
            amounts = container.re('OUR PRICE \xa3(.*) <')
        base_price = amounts[0] if amounts else '0'

        res = {}
        res['url'] = urljoin_rfc(self.URL_BASE, hrefs[0])
        res['description'] = names[0]
        res['price'] = str(Decimal(base_price) + Decimal(3))
        yield load_product(res, response)
def parse_product(self, response):
    """Yield products, or follow sub-links, on an ``all-products.html`` page.

    Rows showing an ``add-to-basket`` image become products; rows without
    one have their heading link requested.  Any other URL yields nothing.

    NOTE(review): nesting was reconstructed from a single-line source; the
    ``else`` is assumed to pair with the ``add-to-basket`` test -- confirm.
    """
    base_url = get_base_url(response)
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    if not re.search(r"all-products\.html$", str(response.url)):
        return

    hxs = HtmlXPathSelector(response)
    rows = hxs.select(
        '//div[@id="double-column-wrapper"]/div[@id="col-2"]/div[@class="col-2-text-div"]/div[@class="col-2-divider"]'
    )
    for row in rows:
        buyable = row.select(".//img[@class='add-to-basket']").extract()
        if not buyable:
            suburl = row.select(
                './/h2[@class="subcategory-h2"]/a/@href')[0].extract()
            yield Request(urljoin_rfc(base_url, suburl))
            continue

        name = row.select('.//h2[@class="subcategory-h2"]/a/text()')[0].extract()
        if not name:
            continue
        href = row.select('.//h2[@class="subcategory-h2"]/a/@href')[0].extract()
        price = row.select(
            './/div[@class="product-list-right"]/div[@class="product-list-price-etc"]/div/p[@class="the-price"]/text()'
        ).re(r"([0-9\,\. ]+)")[0].strip()

        res = {
            "url": urljoin_rfc(base_url, href),
            "description": name,
            "price": price,
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Yield the product on a page with a ``BlockContent`` heading.

    The sale price is preferred over the regular ``ProductPrice`` text.
    ``response.meta['sku']`` is copied into the product when present.
    Pages without the heading yield nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    base_url = get_base_url(response)  # not used below; call retained
    hxs = HtmlXPathSelector(response)

    titles = hxs.select(
        "//div[contains(@class,'BlockContent')]/h1/text()").extract()
    if not titles:
        return

    price = ""
    for price_xpath in (
            '//em[@class="ProductPrice VariationProductPrice"]/span[@class="SalePrice"]/text()',
            '//em[@class="ProductPrice VariationProductPrice"]/text()'):
        price = "".join(hxs.select(price_xpath).re(r'([0-9\,\. ]+)')).strip()
        if price:
            break

    res = {
        'url': response.url,
        'description': titles[0].strip(),
        'price': price,
    }
    if 'sku' in response.meta:
        res['sku'] = response.meta['sku']
    yield load_product(res, response)
def parse_product(self, response):
    """Yield one product per ``SKURecNum`` option, or one for the page.

    With options, each option whose text matches ``<name> -...`` becomes a
    product; its price is the amount after the pound sign in the option
    text, falling back to the page-level price.  Without options a single
    product is yielded.  Raises ``IndexError`` if no product name is found.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    url = urljoin_rfc(base_url, response.url)

    def first_texts(queries):
        # Result of the first query that matches (or of the last one tried).
        texts = []
        for query in queries:
            texts = hxs.select(query).extract()
            if texts:
                break
        return texts

    def first_price(queries):
        # First non-empty price among the queries, or '' if none matches.
        price = ''
        for query in queries:
            price = "".join(
                hxs.select(query).re(r'\xa3([0-9\,\. ]+)')).strip()
            if price:
                break
        return price

    options = hxs.select("//select[@name='SKURecNum']/option/text()").extract()
    if not options:
        name = first_texts([
            "//div[@class='buybox']/table/tr/td/h1/text()",
            "//h1[@class='productNameDN']/text()",
            "//div[@class='buybox']/table/tr/td/table/tr/td/h1/text()",
        ])
        price = first_price([
            "//p[@class='ProductDetailPrice']/strong/a/font[@class='BodyMain']/text()",
            "//p[@class='ProductDetailPrice']/a/font[@class='BodyMain']/text()",
            '//p[@class="ProductDetailPrice"]/strong/font/b/text()',
            '//p[@class="ProductDetailPrice"]/font/b/text()',
        ])
        res = {'url': url, 'description': name[0].strip(), 'price': price}
        yield load_product(res, response)
        return

    name = first_texts([
        "//div[@class='buybox']/table/tr/td/h1/text()",
        "//h1[@class='productNameDN']/text()",
    ])
    for option in options:
        label = re.match(r'(.*) -.*', option.strip())
        if label is None:
            # Not a "<name> - ..." entry; was caught by a bare ``except:``.
            continue
        cost = re.match(
            r'.*\xa3(.*)',
            option.replace("\r", "").replace("\n", "").strip())
        price = cost.group(1) if cost is not None else None
        if not price:
            price = first_price([
                "//p[@class='ProductDetailPrice']/a/font[@class='BodyMain']/text()",
                '//p[@class="ProductDetailPrice"]/font[1]/b/text()',
            ])
        res = {
            'url': url,
            'description': name[0].strip() + u' ' + label.group(1),
            'price': price,
        }
        yield load_product(res, response)
def parse_row(self, response, row):
    """Build a product from ``row['name']`` and ``row['price']``.

    The URL is always the empty string.  Returns whatever
    ``load_product`` returns.
    """
    description = row['name']
    amount = row['price']
    res = {
        'url': '',
        'description': description,
        'price': amount,
    }
    return load_product(res, response)
def parse(self, response):
    """Follow category, group, product and page links; yield products.

    On ``product.asp?dept_id=`` URLs the page's own product is yielded.
    Results of ``self.parse_product(response)`` are yielded last.

    NOTE(review): the source was collapsed onto one line, so the extent of
    the ``category.asp`` branch is a reconstruction: every link query up to
    the "add product" section is assumed to sit inside it -- confirm.
    """
    if not isinstance(response, HtmlResponse):
        return

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    #categories
    for href in hxs.select(
            '//td[@width="33.3%"]/table/tr/td/table/tr/td/a[1]/@href').extract():
        yield Request(urljoin_rfc(base_url, href))

    #log.msg(str(response.url), level=log.WARNING)
    page_url = str(response.url)

    # Regexes are raw strings: '\.' and '\?' in plain literals are invalid
    # escape sequences.
    if re.search(r'category\.asp\?dept_id\=', page_url):
        category_page_queries = (
            '//a[contains(@href,"product_group.asp?dept_id=")]/@href',
            '//a[contains(@href,"product.asp?dept_id=")]/@href',
            '//form[contains(@action,"product_group.asp?dept_id=")]/@action',
            '//form[contains(@action,"product.asp?dept_id=")]/@action',
            '//form[contains(@action,"xt_orderform_additem.asp?")]/p/a/@href',
        )
        for query in category_page_queries:
            for href in hxs.select(query).extract():
                yield Request(urljoin_rfc(base_url, href))

    #add product
    if re.search(r'product\.asp\?dept_id\=', page_url):
        name = hxs.select('//table/tr/td[1]/h1/text()').extract()[0]
        price = hxs.select('//table/tr/td[2]/h1/text()').re(r'[0-9\. \,]+')[0]
        res = {'url': response.url, 'description': name, 'price': price}
        yield load_product(res, response)

    #pages
    for href in hxs.select('//td[@class="ps_scrollnav"]/a/@href').extract():
        yield Request(urljoin_rfc(base_url, href))

    # products
    for product in self.parse_product(response):
        yield product
def parse_product(self, response):
    """Yield ``response.meta`` as a product, adding brand and stock.

    URLs containing ``destock`` are skipped.  The brand is the text of the
    cell in the row whose header reads "Marque", or an empty string.  Note
    that ``response.meta`` itself is updated in place.
    """
    if 'destock' in response.url:
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)  # not used below; call retained

    brands = hxs.select('//tr[th/text()="Marque"]/td/text()').extract()
    if brands:
        brand_name = brands[0]
    else:
        brand_name = ''

    res = response.meta
    res['brand'] = brand_name
    res['stock'] = 1
    yield load_product(res, response)
def parse_product(self, response):
    """Yield a matching product, or one form request per variation option.

    The page is only processed when its manufacturer part value equals
    ``response.meta['mfrgid']`` or appears among the space-separated words
    of ``response.meta['name']``.  Without variation options the page is
    the product; with a single variation select, one form request per
    option is sent to ``parse_options``.  Pages with two or more variation
    selects yield nothing.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    site_mfrgid = hxs.select(
        '//span[@class="prod-detail-man-part-value"]/text()').extract()
    if site_mfrgid:
        site_mfrgid = site_mfrgid[0].strip()
    mfrgid = response.meta['mfrgid']
    product_name = response.meta['name'].split(' ')
    if not site_mfrgid or (site_mfrgid != mfrgid
                           and site_mfrgid not in product_name):
        return

    options = hxs.select(
        "//select[contains(@name,'ddlVariationGroup')]/option")
    if not options:
        name = hxs.select(
            "//div[@id='product-detail-div']/h1/text()")[0].extract().strip()
        price = "".join(
            hxs.select('//span[@class="prod-detail-cost-value"]/text()')
            .re(r'([0-9\,\. ]+)')).strip()
        sku = response.meta['sku']
        res = {
            'url': response.url,
            'description': sku + ' ' + name,
            'price': price,
            'sku': sku,
            'identifier': sku.lower(),
        }
        yield load_product(res, response)
        return

    selects = hxs.select("//select[contains(@name,'ddlVariationGroup')]")
    if len(selects) >= 2:
        return

    # The original also evaluated, on every iteration, an XPath built from
    # search_q without quoting it and then discarded the result; that dead
    # statement is removed.
    search_q = response.meta['search_q']
    select_name = hxs.select(
        "//select[contains(@name,'ddlVariationGroup')]/@name").extract()[0]
    for option in options:
        request = FormRequest.from_response(
            response,
            formdata={select_name: option.select('./@value').extract()},
            dont_click=True,
            callback=self.parse_options)
        request.meta['name2'] = option.select('./text()').extract()[0].strip()
        request.meta['sku'] = response.meta['sku'].lower()
        request.meta['search_q'] = search_q
        yield request
def parse_product(self, response):
    """Yield listed products, reading each price from its price image.

    Products already recorded in ``self.inserted_products`` (keyed by
    name + url) are skipped.  The price image URL is sent to a remote
    ``get_price_from_image`` service and the returned ``price`` field is
    used after removing spaces and replacing ``,`` with ``.``.  A product
    without a price image gets an empty price.

    NOTE(review): the service call is a blocking ``urllib.urlopen`` made
    once per product.
    """
    hxs = HtmlXPathSelector(response)
    for item in hxs.select('//ol[@class="category list"]/li'):
        name = join(item.select('.//div/div/h2//a[1]/text()').extract())
        url = join(item.select('.//div/div/h2/a[1]/@href').extract())
        key = name + url
        if key in self.inserted_products:
            continue

        price_url = join(
            item.select(
                './/div/div/div[@class="prix"]/span[@class="price-including-tax"]/span[1]/img/@src'
            ).extract())
        if not price_url:
            price_url = join(
                item.select(
                    './/div/div/div[@class="prix"]/ins/span[@class="price-including-tax"]/span[1]/img/@src'
                ).extract())

        if price_url:
            params = urllib.urlencode({
                'url': price_url,
                'resize': '200',
                'mode': '8',
                'blur': '1',
                'format': 'float',
            })
            f = urllib.urlopen(
                "http://178.63.95.196/ocr/get_price_from_image?%s" % params)
            try:
                jdata = json.loads(f.read())
            finally:
                # The handle was previously left open for every product.
                f.close()
            log.msg(str(jdata), log.DEBUG)
            price = jdata['price'].encode('utf-8')
            price = price.replace(" ", "").replace(",", ".")
            log.msg(str(price), log.DEBUG)
        else:
            price = ""

        res = {'url': url, 'description': name, 'price': price}
        self.inserted_products.add(key)
        yield load_product(res, response)
def parse_row(self, response, row):
    """Build a product from a row whose name is ``"<sku> - <description>"``.

    The text before the first ``" - "`` becomes the sku and the remainder
    the description (empty when there is no separator).  The URL is always
    the empty string.  Returns whatever ``load_product`` returns.
    """
    sku, _, description = row['name'].partition(" - ")
    res = {
        'url': '',
        'description': description,
        'price': row['price'],
        'sku': sku,
    }
    return load_product(res, response)
def parse_product(self, response):
    """Yield a compound product's sub-products, or the single product.

    When a ``super-product-table`` exists, every row after the first is
    yielded as a product and nothing else is emitted.  Otherwise the page's
    own product is yielded, preferring the special price over the regular
    one; a page missing its name or price yields nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)

    # compound product
    rows = hxs.select('//table[@id="super-product-table"]//tr')
    if rows:
        for row in rows[1:]:
            product = {
                'url': response.url,
                'description': row.select('td[1]//text()').extract()[0],
                'price': ''.join(row.select('td[2]//text()').extract()).strip(),
                'sku': '',
            }
            yield load_product(product, response)
        return

    product = {}
    try:
        product['url'] = response.url
        product['sku'] = ''
        product['description'] = hxs.select(
            '//div[@class="product-name"]/h2/text()').extract()[0]
        special = hxs.select(
            '//div[@class="product-shop"]//p[@class="special-price"]/span[2]/text()'
        ).extract()
        if special:
            product['price'] = special[0]
        else:
            product['price'] = hxs.select(
                '//div[@class="product-shop"]//span[@class="regular-price"]/span/text()'
            ).extract()[0]
        yield load_product(product, response)
    except IndexError:
        # Name or price missing.
        return
def parse_product(self, response):
    """Yield one product per ``hreview-aggregate hproduct`` list item.

    Raises ``IndexError`` if an item lacks its link or a price.
    """
    hxs = HtmlXPathSelector(response)
    for item in hxs.select('//li[contains(@class,"hreview-aggregate hproduct")]'):
        title = item.select('.//div/div/span/a/text()')[0].extract()
        link = item.select('.//div/div/span/a/@href')[0].extract()
        amount = item.select(
            './/div/div/div/span/small/text()').re(r'([0-9\.\, ]+)')[0]
        yield load_product(
            {'url': link, 'description': title, 'price': amount}, response)
def parse_product(self, response):
    """Yield one product per ``list-item`` block.

    Name and URL come from the ``headline`` link, the price from the text
    after the pound sign in the ``price`` span.  Raises ``IndexError`` if
    any of them is missing.
    """
    hxs = HtmlXPathSelector(response)
    for block in hxs.select('//div[@class="list-item"]'):
        headline = block.select('./a[@class="headline"]/text()')[0].extract()
        target = block.select('./a[@class="headline"]/@href')[0].extract()
        amount = block.select('./span[@class="price"]/text()').re('\xa3(.*)')[0]
        res = dict()
        res['url'] = target
        res['description'] = headline
        res['price'] = amount
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one product per variant option, or one for the whole page.

    With variant options, each option whose text has the
    ``<name> - ... £<price>`` shape becomes a product named
    ``<page title> <name>``.  Without options the price is read from the
    ``unitprice`` block.  Raises ``IndexError`` if the page has no title.
    """
    hxs = HtmlXPathSelector(response)
    name = hxs.select(
        "//div[@class='ProductDetails']/h1/text()")[0].extract().strip()
    url = response.url

    options = hxs.select(
        "//div[@id='optionscontainer']/div[@class='variants']/select/option/text()"
    ).extract()
    if not options:
        price = "".join(
            hxs.select("//div[@id='unitprice']/span/text()")
            .re(r'([0-9\,\. ]+)')).strip()
        yield load_product(
            {'url': url, 'description': name, 'price': price}, response)
        return

    for option in options:
        label = re.match(r'(.*) -', option)
        cost = re.match(r'.*\xa3(.*)', option)
        if label is None or cost is None:
            # Option text without a name/price pair; previously this raised
            # AttributeError and stopped the remaining options.
            continue
        # A fresh dict per product so yielded items never share state.
        res = {
            'url': url,
            'description': name + u' ' + label.group(1),
            'price': cost.group(1),
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Emit sub-products of a compound page, else the page's own product.

    Rows of ``super-product-table`` after the first are each yielded as a
    product; when the table exists nothing else is emitted.  For a simple
    page the special price is preferred over the regular price, and a
    missing name or price means nothing is yielded.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)

    # compound product
    table_rows = hxs.select('//table[@id="super-product-table"]//tr')
    if table_rows:
        data_rows = table_rows[1:]
        for data_row in data_rows:
            entry = {}
            entry['url'] = response.url
            entry['description'] = data_row.select('td[1]//text()').extract()[0]
            price_bits = data_row.select('td[2]//text()').extract()
            entry['price'] = ''.join(price_bits).strip()
            entry['sku'] = ''
            yield load_product(entry, response)
        return

    try:
        entry = {'url': response.url, 'sku': ''}
        entry['description'] = hxs.select(
            '//div[@class="product-name"]/h2/text()').extract()[0]
        special_prices = hxs.select(
            '//div[@class="product-shop"]//p[@class="special-price"]/span[2]/text()'
        ).extract()
        if not special_prices:
            regular_prices = hxs.select(
                '//div[@class="product-shop"]//span[@class="regular-price"]/span/text()'
            ).extract()
            entry['price'] = regular_prices[0]
        else:
            entry['price'] = special_prices[0]
        yield load_product(entry, response)
    except IndexError:
        return
def parse_product(self, response):
    """Yield products for each ``product_small_rod`` form on the page.

    Forms with a variant select yield one product per option matching
    ``name (...) (£price)``; forms without options yield one product priced
    from the ``bigprice`` span.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    products = hxs.select("//div[@class='product_small_rod']/form")

    for p in products:
        options = p.select(
            ".//div[@class='rod_inset']/p[@class='vat']/select/option/text()"
        ).extract()
        # Name and link are needed whether or not there are options.
        name = p.select(
            ".//div[@class='rod_inset']/a/h3[@class='coarse']/text()"
        )[0].extract().strip()
        link = p.select(".//div[@class='rod_inset']/a/@href")[0].extract()

        if options:
            for option in options:
                name_match = re.match(r'(.*) \(.*\) \(', option)
                price_match = re.match(r'.*\(\xa3(.*)\)', option)
                if not name_match or not price_match:
                    # Skip entries that do not describe a priced variant
                    # instead of failing with AttributeError.
                    continue
                yield load_product({
                    'url': urljoin_rfc(base_url, link),
                    'description': name + u' ' + name_match.group(1),
                    'price': price_match.group(1),
                }, response)
        else:
            price = "".join(
                p.select(".//span[@class='bigprice']/text()").re(
                    r'([0-9\,\. ]+)')).strip()
            yield load_product({
                'url': urljoin_rfc(base_url, link),
                'description': name,
                'price': price,
            }, response)
def parse_product(self, response):
    """Yield a matching product once, or post back for each variation.

    The page is processed only when its manufacturer part value equals
    ``response.meta['mfrgid']`` or is one of the words of
    ``response.meta['name']``.  Without variation options the product is
    yielded unless its (name, price) pair is already in ``self._items``.
    With a single variation select, one form request per option after the
    first is sent to ``parse_options``, carrying the page's
    ``__EVENTVALIDATION`` and ``__VIEWSTATE`` values.
    """
    if not isinstance(response, HtmlResponse):
        return

    base_url = get_base_url(response)  # not used below; call retained
    hxs = HtmlXPathSelector(response)
    meta = response.meta

    site_mfrgid = hxs.select(
        '//span[@class="prod-detail-man-part-value"]/text()').extract()
    if site_mfrgid:
        site_mfrgid = site_mfrgid[0].strip()
    mfrgid = meta['mfrgid']
    name_words = meta['name'].split(' ')
    if not site_mfrgid:
        return
    if site_mfrgid != mfrgid and site_mfrgid not in name_words:
        return

    options = hxs.select("//select[contains(@name,'ddlVariationGroup')]/option")

    if not options:
        # no options
        name = hxs.select(
            "//div[@id='product-detail-div']/h1/text()")[0].extract().strip()
        price = "".join(
            hxs.select('//span[@class="prod-detail-cost-value"]/text()')
            .re(r'([0-9\,\. ]+)')).strip()
        res = {}
        res['url'] = response.url
        res['description'] = meta['sku'] + ' ' + name
        res['price'] = price
        res['sku'] = meta['sku']
        res['identifier'] = meta['search_q']
        seen_key = (name, price)
        if seen_key not in self._items:
            self._items.add(seen_key)
            yield load_product(res, response)
        return

    event_validation = hxs.select(
        '//*[@id="__EVENTVALIDATION"]/@value').extract()[0]
    view_state = hxs.select('//*[@id="__VIEWSTATE"]/@value').extract()[0]
    selects = hxs.select("//select[contains(@name,'ddlVariationGroup')]")
    if len(selects) >= 2:
        return

    select_name = hxs.select(
        "//select[contains(@name,'ddlVariationGroup')]/@name").extract()[0]
    for option in options[1:]:
        option_value = option.select('./@value')[0].extract()
        formdata = {
            u'ctl00$MainContent$ViewTypeCheckBox': u'on',
            u'__EVENTTARGET': select_name,
            u'__EVENTARGUMENT': u'',
            u'__EVENTVALIDATION': event_validation,
            u'__VIEWSTATE': view_state,
            select_name: option_value,
        }
        request_meta = {
            'option': option.select('./text()').extract()[0].strip(),
            'option_id': option_value,
            'sku': meta['sku'].lower(),
            'search_q': meta['search_q'],
        }
        yield FormRequest(url=response.url,
                          formdata=formdata,
                          meta=request_meta,
                          callback=self.parse_options)
def parse_product(self, response):
    """Yield one product per ``item last`` list entry.

    The regular price is used when present, otherwise the special price.
    Raises ``IndexError`` if an entry has no ``product-name`` link.
    """
    hxs = HtmlXPathSelector(response)
    for entry in hxs.select('//li[@class="item last"]'):
        title = entry.select(
            './/h2[@class="product-name"]/a/text()')[0].extract().strip()
        link = entry.select(
            './/h2[@class="product-name"]/a/@href')[0].extract().strip()

        amount = "".join(
            entry.select(
                './/div[@class="price-box"]/span[@class="regular-price"]/span[@class="price"]/text()'
            ).re(r'([0-9\,\. ]+)')).strip()
        if not amount:
            amount = "".join(
                entry.select(
                    './/div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()'
                ).re(r'([0-9\,\. ]+)')).strip()

        yield load_product(
            {'url': link, 'description': title, 'price': amount}, response)
def parse_product(self, response):
    """Yield one item per ``prodtable`` row that contains a product link.

    Rows without a ``prodtableitem`` link are skipped.  The link href is
    resolved against the response's base URL.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for row in hxs.select('//table[@class="prodtable"]/tr'):
        titles = row.select('.//td[@class="prodtableitem"]/a/text()').extract()
        if not titles:
            continue

        href = row.select('.//td[@class="prodtableitem"]/a/@href')[0].extract()
        amount = "".join(
            row.select('.//td[@class="prodtableprice"]/text()').re(
                r'([0-9\,\. ]+)')).strip()

        res = {
            'url': urljoin_rfc(base_url, href),
            'description': titles[0],
            'price': amount,
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Yield the single product described by a detail page.

    The product name is used both as description and as SKU.  The price
    is the text captured between the pound sign and the next ``<`` inside
    the first break-pricing block.  Raises ``IndexError`` when the name
    or the price cannot be found.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    title = hxs.select(
        './/span[@id="productInformationHeaderReference"]/text()').extract()[0]
    price_matches = hxs.select(
        './/div[@class="AXISBreakPricing1"]/div[@class="AXISBreakPricingPrice"]'
    ).re("\xa3(.*)<")

    res = {
        'url': response.url,
        'description': title,
        'price': price_matches[0],
        'sku': title,
    }
    yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per ``product-single`` block that has a title link.

    Blocks without a title are skipped.  The href is resolved against the
    response's base URL.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for block in hxs.select('//div[@class="product-single"]'):
        titles = block.select(
            './form/div/div[@class="section-product-title"]/a/text()').extract()
        if not titles:
            continue

        href = block.select(
            './form/div/div[@class="section-product-title"]/a/@href')[0].extract()
        amount = "".join(
            block.select('./form/div/div/span[@class="price"]/text()').re(
                r'([0-9\,\. ]+)')).strip()

        res = {
            'url': urljoin_rfc(base_url, href),
            'description': titles[0],
            'price': amount,
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Scrape one product from a detail page and yield it.

    Description and SKU are both set to the page's product name; the
    price is whatever follows the pound sign up to the next ``<`` in the
    first break-pricing block.  A missing name or price raises
    ``IndexError``.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    names = hxs.select(
        './/span[@id="productInformationHeaderReference"]/text()').extract()
    product_name = names[0]
    amounts = hxs.select(
        './/div[@class="AXISBreakPricing1"]/div[@class="AXISBreakPricingPrice"]'
    ).re("\xa3(.*)<")
    amount = amounts[0]

    res = dict(url=response.url, description=product_name, price=amount)
    res["sku"] = res["description"]
    yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per product found on a djkit.com listing page.

    A product node is the grandparent of each ``<a class="product">``.
    The title text and the link href are both read relative to that node,
    so each item carries its own URL.  (The href was previously looked up
    with ``..//``, i.e. from the node's parent, where ``[0]`` could return
    the link of a different product under the same parent.)

    The price is the ``showprice`` text with its first character (the
    currency symbol) dropped.  Raises ``IndexError`` when a product node
    lacks a title, link or price.
    """
    URL_BASE = 'http://www.djkit.com'
    hxs = HtmlXPathSelector(response)

    for p in hxs.select('//a[@class="product"]/../..'):
        name = p.select('.//div[@class="title"]/a/text()')[0].extract()
        href = p.select('.//div[@class="title"]/a/@href')[0].extract()
        price = p.select('.//div[@class="showprice"]/span/text()').re('.(.*)')[0]

        res = {
            'url': urljoin_rfc(URL_BASE, href),
            'description': name,
            'price': price,
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per table inside the ``fulllist`` container.

    The link href is stored as found on the page.  The price is the text
    following the pound sign in the ``price`` span.  Raises ``IndexError``
    when a table lacks a title, link or price.
    """
    URL_BASE = 'http://www.djstore.com'  # defined but not used below
    hxs = HtmlXPathSelector(response)

    for table in hxs.select('//div[@class="fulllist"]//table'):
        title = table.select('.//a[@class="producttitle"]/text()')[0].extract()
        link = table.select('.//a[@class="producttitle"]/@href')[0].extract()
        # TODO Check price is the correct one.
        amount = table.select('.//span[@class="price"]/text()').re('\xa3(.*)')[0]

        res = {'url': link, 'description': title, 'price': amount}
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per ``<li>`` of the search-results list.

    Hrefs are resolved against ``http://www.decks.co.uk``.  The price is
    the text following the pound sign in the ``price`` heading.  Raises
    ``IndexError`` when an entry lacks a title, link or price.
    """
    URL_BASE = 'http://www.decks.co.uk'
    hxs = HtmlXPathSelector(response)

    for entry in hxs.select('//div[@id="search-results"]/ul/li'):
        title = entry.select('.//h2/a/text()')[0].extract()
        href = entry.select('.//h2/a/@href')[0].extract()
        absolute_url = urljoin_rfc(URL_BASE, href)
        amount = entry.select(
            './/h3[@class="price"]/strong/text()').re('\xa3(.*)')[0]

        res = {'url': absolute_url, 'description': title, 'price': amount}
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per ``hreview-aggregate hproduct`` list entry.

    Name and URL come from the first nested span link; the price is the
    first digit/dot/comma/space run in the nested ``<small>`` text.
    Raises ``IndexError`` when any of the three is missing.
    """
    hxs = HtmlXPathSelector(response)
    entries = hxs.select('//li[contains(@class,"hreview-aggregate hproduct")]')

    for entry in entries:
        title = entry.select('.//div/div/span/a/text()')[0].extract()
        link = entry.select('.//div/div/span/a/@href')[0].extract()
        amounts = entry.select('.//div/div/div/span/small/text()').re(
            r'([0-9\.\, ]+)')

        res = {'url': link, 'description': title, 'price': amounts[0]}
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per ``listItem clearfix`` block.

    The description is every text node under the block's links joined
    with single spaces; the URL is the first link's href resolved against
    ``http://www.dv247.com``.  Raises ``IndexError`` when a block has no
    link or no pound-prefixed price.
    """
    URL_BASE = 'http://www.dv247.com'
    hxs = HtmlXPathSelector(response)

    for item in hxs.select('//div[@class="listItem clearfix"]'):
        link_texts = item.select('.//a//text()').extract()
        description = ' '.join(link_texts)
        href = item.select('.//a/@href')[0].extract()
        absolute_url = urljoin_rfc(URL_BASE, href)
        amount = item.select('.//li[@class="price"]/text()').re('\xa3(.*)')[0]

        res = {'url': absolute_url, 'description': description, 'price': amount}
        yield load_product(res, response)
def parse_product(self, response):
    """Yield the product of a detail page when it has a non-empty price.

    The description is looked up in ``self.names`` by response URL and
    falls back to the name scraped from the page.  Raises ``IndexError``
    when the name or price node is absent.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    page_url = response.url
    res = {'url': page_url}

    scraped_name = hxs.select(
        '//div[@class="productname"]//text()').extract()[0].strip()
    amount = hxs.select(
        '//span[@class="productPrice"]/span/text()').extract()[0]
    if not amount:
        return

    res['description'] = self.names.get(page_url, scraped_name)
    res['price'] = amount
    yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per ``prod_item prod_grid`` block.

    The description is the title link's ``title`` attribute, the URL its
    href resolved against ``http://www.getinthemix.co.uk``, and the price
    the raw text of the ``prod_price_web`` div.  Raises ``IndexError``
    when any of them is missing.
    """
    URL_BASE = 'http://www.getinthemix.co.uk'
    hxs = HtmlXPathSelector(response)

    for tile in hxs.select('//div[contains(@class,"prod_item prod_grid")]'):
        title = tile.select('.//div[@class="prod_title"]/a/@title')[0].extract()
        href = tile.select('.//div[@class="prod_title"]/a/@href')[0].extract()
        absolute_url = urljoin_rfc(URL_BASE, href)
        amount = tile.select('.//div[@class="prod_price_web"]/text()')[0].extract()

        res = {'url': absolute_url, 'description': title, 'price': amount}
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per not-yet-seen product of a category listing.

    Products are de-duplicated on ``name + url`` through
    ``self.inserted_products``.  The price is read from an image: the
    image URL is sent to the OCR endpoint and the ``price`` field of the
    JSON reply is normalised (spaces removed, comma turned into a dot).
    When no price image is found the price is the empty string.

    The HTTP handle returned by ``urllib.urlopen`` is now always closed,
    including when reading or decoding the reply fails.
    """
    hxs = HtmlXPathSelector(response)

    for p in hxs.select('//ol[@class="category list"]/li'):
        # NOTE(review): `join` comes from outside this method; presumably
        # string.join (space separator) - confirm against the imports.
        name = join(p.select(".//div/div/h2//a[1]/text()").extract())
        url = join(p.select(".//div/div/h2/a[1]/@href").extract())

        dedupe_key = name + url
        if dedupe_key in self.inserted_products:
            continue

        # The price is an <img>; try the plain markup first, then the
        # variant wrapped in <ins>.
        price_url = join(
            p.select(
                './/div/div/div[@class="prix"]/span[@class="price-including-tax"]/span[1]/img/@src'
            ).extract())
        if not price_url:
            price_url = join(
                p.select(
                    './/div/div/div[@class="prix"]/ins/span[@class="price-including-tax"]/span[1]/img/@src'
                ).extract())

        if price_url:
            params = urllib.urlencode(
                {"url": price_url, "resize": "200", "mode": "8", "blur": "1", "format": "float"}
            )
            f = urllib.urlopen("http://178.63.95.196/ocr/get_price_from_image?%s" % params)
            try:
                jdata = json.loads(f.read())
            finally:
                # Release the connection even if read/parse raises.
                f.close()
            log.msg(str(jdata), log.DEBUG)
            price = jdata["price"].encode("utf-8")
            price = price.replace(" ", "").replace(",", ".")
            log.msg(str(price), log.DEBUG)
        else:
            price = ""

        res = {"url": url, "description": name, "price": price}
        self.inserted_products.add(dedupe_key)
        yield load_product(res, response)
def parse_product(self, response):
    """Yield the product described by a schema.org-annotated detail page.

    Required fields - name and identifier - raise ``IndexError`` when
    absent.  Optional fields are handled by checking for an empty
    selection instead of a bare ``except:``, so unrelated errors are no
    longer silently swallowed:

    * ``price`` defaults to the empty string;
    * ``sku``, ``image_url``, ``category`` and ``brand`` are simply left
      out of the item when the page does not provide them.

    The category is the last breadcrumb link text.  The brand is taken
    from a child element of the manufacturer value, falling back to the
    value's own text.
    """
    if not isinstance(response, HtmlResponse):
        return

    res = {}
    hxs = HtmlXPathSelector(response)
    res['url'] = response.url

    name = hxs.select('//*[@itemprop="name"]//text()').extract()[0].strip()
    prices = hxs.select('//*[@itemprop="price"]/span/text()').extract()
    res['description'] = name
    res['price'] = prices[0] if prices else ""

    skus = hxs.select(
        '//div[@class="manufacturer-part-number"]'
        '/span[@class="value"]/text()').extract()
    if skus:
        res['sku'] = skus[0].strip()

    res['identifier'] = hxs.select(
        '//div[@itemtype="http://schema.org/Product"]/@data-productid'
    ).extract()[0]

    image = hxs.select('//div[@class="picture"]//img/@src').extract()
    if image:
        res['image_url'] = urljoin_rfc(get_base_url(response),
                                       image[0].strip())

    breadcrumbs = hxs.select(
        '//div[@class="breadcrumb"]/ul/li//a/span/text()').extract()
    if breadcrumbs:
        res['category'] = breadcrumbs[-1]

    brands = hxs.select(
        '//div[@class="manufacturers"]/*[@class="value"]/*/text()').extract()
    if not brands:
        brands = hxs.select(
            '//div[@class="manufacturers"]/*[@class="value"]/text()').extract()
    if brands:
        res['brand'] = brands[0]

    yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per product on a djkit.com listing page.

    Each product node is the grandparent of an ``<a class="product">``.
    Both the title and the href are selected with ``.//`` so they come
    from the same product node; the href used to be selected with ``..//``
    (from the parent), which lets ``[0]`` pick up a sibling product's
    link.

    The price is the ``showprice`` text minus its first character.
    Raises ``IndexError`` when title, link or price is missing.
    """
    URL_BASE = 'http://www.djkit.com'
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//a[@class="product"]/../..')

    for p in products:
        res = {}
        name = p.select('.//div[@class="title"]/a/text()')[0].extract()
        url = p.select('.//div[@class="title"]/a/@href')[0].extract()
        url = urljoin_rfc(URL_BASE, url)
        price = p.select('.//div[@class="showprice"]/span/text()').re(
            '.(.*)')[0]
        res['url'] = url
        res['description'] = name
        res['price'] = price
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item for each table under the ``fulllist`` div.

    The product link's href is passed through unchanged.  The price is
    the text after the pound sign in the ``price`` span.  A table without
    a title, link or price raises ``IndexError``.
    """
    URL_BASE = 'http://www.djstore.com'  # assigned but never read here
    hxs = HtmlXPathSelector(response)
    tables = hxs.select('//div[@class="fulllist"]//table')

    for table in tables:
        product_name = table.select(
            './/a[@class="producttitle"]/text()')[0].extract()
        product_url = table.select(
            './/a[@class="producttitle"]/@href')[0].extract()
        price_matches = table.select(
            './/span[@class="price"]/text()').re('\xa3(.*)')
        product_price = price_matches[0]  # TODO Check price is the correct one.

        res = dict(url=product_url,
                   description=product_name,
                   price=product_price)
        yield load_product(res, response)
def parse(self, response):
    """Follow category links and yield the products found on the page.

    Yields, in order:

    1. a ``Request`` for every category-menu link;
    2. a ``Request`` for every sub-category link;
    3. an item for every "detailed" product row of the product table,
       whose price is read from the table row that follows it;
    4. whatever ``self.parse_product(response)`` yields.

    A detailed product row with no following row (i.e. the last row of
    the table) is skipped instead of raising ``IndexError``, so step 4
    still runs for that page.
    """
    if not isinstance(response, HtmlResponse):
        return

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    # categories
    category_urls = hxs.select(
        '//div[@id="category-menu"]/ul/li/a/@href').extract()
    for url in category_urls:
        yield Request(urljoin_rfc(base_url, url))

    # subcategories
    subcategory_urls = hxs.select(
        '//div[@id="double-column-wrapper"]/div[@id="col-2"]/div[@class="col-2-text-div"]/div[@class="col-2-divider"]/div[@class="category-right"]/a/@href'
    ).extract()
    for suburl in subcategory_urls:
        yield Request(urljoin_rfc(base_url, suburl))

    # detailed products
    rows = hxs.select('//div[@class="product-right"]/form/table/tr')
    row_count = len(rows)
    for index, row in enumerate(rows):
        if not row.select("./td[@style='font-size: 12px;']").extract():
            continue

        next_index = index + 1
        if next_index >= row_count:
            # The price lives in the following row; there is none.
            continue
        price_row = rows[next_index]

        name = row.select(
            './td[@style="font-size: 12px;"]/a/text()')[0].extract()
        url = row.select(
            './td[@style="font-size: 12px;"]/a/@href')[0].extract()
        price = price_row.select(
            './td[1]/span[@class="the-price"]/text()').re(
                r'([0-9\,\. ]+)')[0].strip()
        identifier = row.select(
            './/input[contains(@name, "stkcode-")]/@value')[0].extract().strip()

        res = {
            'url': urljoin_rfc(base_url, url),
            'description': name,
            'price': price,
            'identifier': identifier,
        }
        yield load_product(res, response)

    # products
    for p in self.parse_product(response):
        yield p
def parse_product(self, response):
    """Yield one item per ``mini_fiche_ligne`` form on a product page.

    Nothing is yielded when the page has a ``nbPagesPerPage`` select or a
    level-1 image title.  Every item uses the response URL as its URL.
    Raises ``IndexError`` when a form has no title.
    """
    hxs = HtmlXPathSelector(response)
    pager = hxs.select('//select[@name="nbPagesPerPage"]')
    category_title = hxs.select('//h2[@class="titre_image titre_image_niv1"]')
    if pager or category_title:
        return

    forms = hxs.select(
        '//div[@class="bloc_cadre_pied"]/form[@class="mini_fiche_ligne"]')
    for form in forms:
        title = form.select(
            './/div[@class="colonne_1"]/div[@class="ligne_titre"]/span[@class="titre_descriptif"]/strong/text()'
        )[0].extract().strip()
        amount = "".join(
            form.select(
                './/div[@class="lignebeige"]/div[@class="wrapperPrix"]/div/div/div/b/text()'
            ).re(r'([0-9\,\. ]+)')).strip()

        res = {'url': response.url, 'description': title, 'price': amount}
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per add-to-order form on a product-group page.

    Only URLs containing ``product_group.asp?dept_id=`` are processed.
    Forms without a titled link are skipped.  The price comes from the
    ``price`` span(s), with the second ``price`` span as a fallback.

    A form for which neither selector produces a price is skipped;
    previously ``price[0]`` raised ``IndexError`` there and the remaining
    products of the page were lost.
    """
    base_url = get_base_url(response)
    # Raw string: the pattern contains regex escapes, not string escapes.
    if not re.search(r'product_group\.asp\?dept_id\=', str(response.url)):
        return

    hxs = HtmlXPathSelector(response)
    for p in hxs.select('//form[@action="xt_orderform_additem.asp?"]'):
        name = p.select('./p/a/@title').extract()
        if not name:
            continue

        url = p.select('./p/a/@href')[0].extract()
        price = p.select('./p/span[@class="price"]/text()').re(
            r'([0-9][0-9\,\.]+)')
        if not price:
            price = p.select('./p/span[@class="price"][2]/text()').re(
                r'([0-9][0-9\,\.]+)')
        if not price:
            # No parsable price for this product.
            continue

        res = {
            'url': urljoin_rfc(base_url, url),
            'description': name[0],
            'price': price[0].strip(),
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Yield one item per ``<article>`` whose class starts with "product".

    The ``ProductID`` text is used as both description and SKU; the href
    of the heading link is resolved against ``self.URL_BASE``; the price
    is the text between the pound sign and `` ex`` in the ``ex_vat``
    span.  Raises ``IndexError`` when any of them is missing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    for article in hxs.select('//article[starts-with(@class, "product")]'):
        product_id = article.select(
            './/span[@itemprop="ProductID"]/text()')[0].extract()
        href = article.select('.//h3/a/@href')[0].extract()
        absolute_url = urljoin_rfc(self.URL_BASE, href)
        amount = article.select(
            './/span[@class="ex_vat"]/text()').re('\xa3(.*) ex')[0]

        res = {
            'url': absolute_url,
            'description': product_id,
            'price': amount,
            'sku': product_id,
        }
        yield load_product(res, response)
def parse_product(self, response):
    """Yield the product of a detail page, or nothing if it is incomplete.

    Name and price are read from the ``product-detail`` block; the SKU is
    always the empty string.  Any ``IndexError`` raised while scraping or
    building the item (e.g. a missing name or price) ends the callback
    without yielding.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    res = {}
    try:
        title = hxs.select('//div[@id="product-detail"]/h1/text()').extract()[0]
        amount = hxs.select(
            '//div[@id="product-detail"]/p[@class="ours"]/strong/text()'
        ).re('\xa3(.*)')[0]
        res.update({
            'url': response.url,
            'description': title,
            'price': amount,
            'sku': '',
        })
        yield load_product(res, response)
    except IndexError:
        return