def parse(self, response):
    """Fan out over A-Z categories, subcategories/brands and pagination,
    then delegate any products on this page to parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    # Top-level A-Z category index.
    for href in hxs.select('//div[@id="atoz"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    # Subcategory and related-brand links.
    sub_links = hxs.select(u'//div[@class="CategoryChildCategoriesLink"]/a/@href').extract()
    sub_links += hxs.select('//div[@id="relatedbrands"]//a/@href').extract()
    for href in sub_links:
        yield Request(urljoin_rfc(base, href))

    # "Next" pagination button, if present.
    next_page = hxs.select(u'//div[@class="CategoryPageNavigation"]//a[child::span[contains(text(),"Next")]]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    # The current page may also list products directly.
    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Follow the '下一页' (next page) link and schedule each article
    in the list for detail parsing."""
    base = get_base_url(response)

    next_page = response.xpath(u"//a[contains(text(), '下一页')]/@href").extract()
    if next_page:
        yield scrapy.Request(urljoin_rfc(base, next_page[0]))

    article_links = response.xpath("//td[@id='div_list']/ul[@class='ul_art_row']/li[@class='li_art_title']/a/@href").extract()
    for href in article_links:
        yield scrapy.Request(urljoin_rfc(base, href), callback=self.parse_detail)
def parse_products(self, response):
    """Scrape brand+name, url and price from listing tables, then follow
    the last un-classed paging link to the next listing page."""
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for product in hxs.select('//div/div/table'):
        name = ''.join(product.select('tr/td/div[@class="featuredProductLinks"]/a/text()').extract())
        if not name:
            continue  # not a product table
        loader = ProductLoader(item=Product(), selector=product)
        brand = ''.join(product.select('tr/td/div[@class="featuredMIS"]/a/text()').extract())
        loader.add_value('name', ' '.join((brand, name)))
        relative_url = product.select('tr/td/div[@class="featuredProductLinks"]/a/@href').extract()
        loader.add_value('url', urljoin_rfc(base, relative_url[0]))
        # European price format "1.234,56" -> "1234.56"; sale price wins,
        # otherwise fall back to the variant price span.
        price = ''.join(product.select('tr/td/div/div[@class="featuredProductPrice"]/span/span[@class="SalePrice1"]/text()').extract()).replace('.', '').replace(',', '.')
        if not price:
            price = ''.join(product.select('tr/td/div/div[@class="featuredProductPrice"]/span/span[@class="variantprice1"]/text()').extract()).replace('.', '').replace(',', '.')
        loader.add_value('price', price)
        yield loader.load_item()

    paging = hxs.select('//div[@class="pagingdiv"]/a[not(@class)]/@href').extract()
    if paging:
        yield Request(urljoin_rfc(base, paging[-1]), callback=self.parse_products)
def parse_products(self, response):
    """Parse listing items; products with 'From'/'Now' price markers get an
    options page visit, others are loaded directly. Follow pagination,
    and descend into subcategories only when paging is exhausted."""
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for product in hxs.select('//li[@class="item" or @class="item lastItem"]'):
        name = product.select('div/h3/a/span/text()').extract()[0]
        url = product.select('div/h3/a/@href').extract()
        if url:
            url = urljoin_rfc(base, url[0])
        options_from = ''.join(product.select('div/p[@class="price money"]/span/abbr/text()').extract()).strip()
        options_now = ''.join(product.select('div/p[@class="price money"]/text()').extract()).strip()
        # "From"/"Now" means the price depends on the chosen option.
        if ('From' in options_from) or ('Now' in options_now):
            yield Request(url, callback=self.parse_options, meta={'name': name})
            continue
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('name', name)
        loader.add_value('url', url)
        price = product.select('div/p[@class="price money"]/span/span/text()').extract()
        if not price:
            price = product.select('div/p[@class="price money"]/ins/span/text()').extract()
        loader.add_value('price', price[0] if price else '')
        yield loader.load_item()

    # "Next »" link (UTF-8 bytes decoded for the py2 selector API).
    next_links = hxs.select('//a[@rel="nofollow" and span/text()="Next \xc2\xbb"]/@href'.decode('utf')).extract()
    if next_links:
        yield Request(urljoin_rfc(base, next_links[0]), callback=self.parse_products)
    else:
        for sub_category in hxs.select('//*[@id="categoryNavigation"]/li/ul/li/a/@href').extract():
            yield Request(urljoin_rfc(base, sub_category), callback=self.parse_products)
def parse(self, response):
    """Follow navigation and category-wrap links, then schedule every
    product-wrap link for product parsing."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select(u'//div[@class="navigation-inner"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    for href in hxs.select(u'//div[@class="category-wrap"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    # NOTE: no pagination handling — these listings appear not to paginate.
    for href in hxs.select(u'//div[@class="product-wrap"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Crawl only the monitored top-level menu sections, follow text-link
    categories and 'Suivante' pagination, then parse products in place."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    # Restrict the crawl to these menu sections.
    for category in (u'Bricolage', u'Technique de nettoyage', u'Outillage Jardin'):
        menu_xpath = u'//div[@class="suckerdiv"]/ul[child::li[child::a[contains(text(),"%s")]]]//a/@href' % category
        for href in hxs.select(menu_xpath).extract():
            yield Request(urljoin_rfc(base, href))

    for href in hxs.select(u'//td[@class="smallText"]/a[child::br]/@href').extract():
        yield Request(urljoin_rfc(base, href))

    # "Suivante" = next page.
    next_page = hxs.select(u'//a[child::u[contains(text(),"Suivante")]]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Follow brand, category and subcategory links plus 'Next' pagination,
    then parse any products present on this page."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select(u'//ul[@class="bare-list"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    for href in hxs.select(u'//li[contains(@class,"level-0")]/a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    subcat_xpath = u'//div[@class="fieldset-custom" and descendant::h2[text()="Categories"]]//div[@class="cat_name"]//a/@href'
    for href in hxs.select(subcat_xpath).extract():
        yield Request(urljoin_rfc(base, href))

    next_page = hxs.select(u'//a[@class="next" and @title="Next" and contains(text(), "Next")]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for product in self.parse_product(response):
        yield product
def parse_tianshannet(self, response):
    """Pull article links out of tianshannet title/link containers and
    schedule each for item parsing; relative hrefs are made absolute."""
    soup = BeautifulSoup.BeautifulSoup(response.body)
    search_mapping = {'div': {'class': ['title_txt', 'txt link05']}}
    for tag_name, attr_map in search_mapping.items():
        for attr, values in attr_map.items():
            for container in soup.findAll(tag_name, {attr: values}):
                list_items = container.findAll('li')
                if list_items:
                    # Links are nested inside <li> elements.
                    anchors = [a for li in list_items for a in li.findAll('a')]
                else:
                    # No list structure: take the container's anchors directly.
                    anchors = container.findAll('a')
                for anchor in anchors:
                    href = anchor['href']
                    if not href.startswith('http'):
                        href = urljoin_rfc(get_base_url(response), href)
                    yield Request(url=href, meta=response.meta, callback=self.parse_item)
def parse_sina(self, response):
    """Parse sina list pages. gd.news.sina.com.cn pages are plain HTML
    lists; other hosts embed the items as JSON-like records in the body."""
    host = urlparse.urlparse(response.url).hostname
    if host == "gd.news.sina.com.cn":
        soup = BeautifulSoup.BeautifulSoup(response.body)
        search_mapping = {'div': {'class': ['list']}}
        for tag_name, attr_map in search_mapping.items():
            for attr, values in attr_map.items():
                for container in soup.findAll(tag_name, {attr: values}):
                    for anchor in container.findAll('a', target='_blank'):
                        href = anchor['href']
                        if not href.startswith('http'):
                            href = urljoin_rfc(get_base_url(response), href)
                        yield Request(url=href, meta=response.meta, callback=self.parse_item)
    else:
        # Items look like {"title":"...","url":"...","createdate":"..."},
        pattern = '\{"title":".*?","url":"(.*?)","createdate":".*?"\},'
        for href in re.compile(pattern).findall(response.body):
            if not href.startswith('http'):
                href = urljoin_rfc(get_base_url(response), href)
            yield Request(url=href, meta=response.meta, callback=self.parse_item)
def parse(self, response):
    """Follow store-switcher, sidebar and category-grid links, paginate via
    the 'i-next' arrow, and schedule products-grid links for parsing."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select(u'//div[@class="store-switcher"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    sub_links = hxs.select(u'//div[@class="col-left sidebar"]//a/@href').extract()
    sub_links += hxs.select(u'//ul[@class="category-grid"]//a/@href').extract()
    for href in sub_links:
        yield Request(urljoin_rfc(base, href))

    next_page = hxs.select(u'//a[@class="next i-next"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for href in hxs.select(u'//ul[@class="products-grid"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Crawl 'Aspirateur' menu sections and the links between the
    'Plein Air' and 'Outillage' headings, follow 'Suivant' pagination,
    and schedule de-duplicated product links."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    category_links = hxs.select(u'//li/a[contains(text(),"Aspirateur ")]/@href').extract()
    category_links += hxs.select(u'//li[child::a[contains(text(),"Aspirateur ")]]/ul//a/@href').extract()
    # Divs sandwiched between the "Plein Air" and "Outillage" headings.
    category_links += hxs.select(u'//div[preceding-sibling::div[child::strong[contains(text(), "Plein Air")]] and following-sibling::div[child::strong[contains(text(),"Outillage")]]]//a[not(child)]/@href').extract()
    for href in category_links:
        yield Request(urljoin_rfc(base, href))

    next_page = hxs.select(u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    product_links = hxs.select(u'//div[contains(@class,"productCell") or contains(@class,"productLine")]//a/@href').extract()
    product_links += hxs.select(u'//div[contains(@id,"tabContent0")]//div[contains(@class,"product100")]//a/@href').extract()
    product_links += hxs.select(u'//div[contains(@class,"productHomeListBody")]//a/@href').extract()
    # Drop '#' placeholders and de-duplicate before scheduling.
    for href in set(link for link in product_links if link != '#'):
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Load products from the description cells and recurse through the
    category menu, whose nesting depth is tracked via meta['level']."""
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for item in hxs.select(u'//td[@class="Description_ProductList"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_xpath('name', u'.//a/@title')
        # Danish format, e.g. "1.234,56 DKK" -> "1234.56".
        price = item.select(u'../..//span[@class="Price_Productlist"]/text()').extract()[0]
        price = price.strip().rstrip(' DKK').replace('.', '').replace(',', '.')
        if price == u'Ring for pris!':  # "call for price" placeholder
            price = 0
        loader.add_value('price', price)
        url = item.select(u'.//a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base, url))
        yield loader.load_item()

    # Each menu level adds one table/tr/td layer to the XPath.
    level = response.meta.get('level', 1)
    sub_xpath = u'//table[@id="ProductMenu_Table"]/../' + u'/'.join([u'table/tr/td'] * level) + '/a/@href'
    for subcategory in hxs.select(sub_xpath).extract():
        yield Request(urljoin_rfc(base, subcategory), meta={'level': level + 1})
def parse(self, response):
    """Crawl a pinned set of vendor listing pages, following 'Next page'
    links and scheduling each product title link for parsing."""
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    # Category discovery is intentionally restricted to these vendors
    # (generic category/subcategory crawling was disabled).
    vendor_urls = [
        'http://www.webstaurantstore.com/vendor/CAR150/cardinal-international.html',
        'http://www.webstaurantstore.com/vendor/LIB500/libbey.html',
        'http://www.webstaurantstore.com/vendor/VOL300/vollrath.html',
        'http://www.webstaurantstore.com/vendor/RUS600/dexter-russell.html',
        'http://www.webstaurantstore.com/vendor/GET600/get-enterprises.html',
        'http://www.webstaurantstore.com/vendor/BEV500/beverage-air.html',
    ]
    for vendor_url in vendor_urls:
        yield Request(vendor_url)

    next_page = hxs.select('//a[@title="Next page"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for href in hxs.select('//td[@class="search_product_title"]/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse_products(self, response):
    """Schedule product detail pages (carrying name/price in meta), then
    either paginate or descend into subcategories.

    Pagination is followed only one level deep: requests scheduled from a
    pagination link carry do_pagination=False so their own paging links are
    ignored, preventing an unbounded page crawl.
    """
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for product in hxs.select('//div[@class="prod"]'):
        name = product.select('div/form/fieldset/div/h5/a/span/text()').extract()[0].strip()
        url = product.select('div/form/fieldset/div/h5/a/@href').extract()
        if url:
            url = urljoin_rfc(base, url[0])
        price = product.select('div/form/fieldset/div/span[@class="productPrice priceExVAT"]/text()').extract()[0].strip()
        yield Request(url, callback=self.parse_product, meta={'name': name, 'price': price})

    pages = hxs.select('//span[@class="pagingButton"]/a/@href').extract()
    if pages:
        # BUGFIX: the original indexed response.meta['do_pagination']
        # directly, raising KeyError when this callback was reached
        # without that meta key; .get() treats a missing key as False.
        if response.meta.get('do_pagination'):
            for page in pages:
                yield Request(urljoin_rfc(base, page), callback=self.parse_products,
                              meta={'do_pagination': False})
    else:
        for sub_category in hxs.select('//div[@class="subcat"]/div/a/@href').extract():
            yield Request(urljoin_rfc(base, sub_category), callback=self.parse_products,
                          meta={'do_pagination': True})
def parse_subcategory(self, response):
    """Collect product-page links; when the product-name links are absent,
    fall back to the 'more info' anchors in the left content pane."""
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    hrefs = [x.strip() for x in hxs.select(".//*[@class='prodnamelink']/a[1]/@href").extract() if x.strip()]
    if not hrefs:
        hrefs = [x.strip() for x in hxs.select(".//*[@id='pagecontentleft']/.//a[contains(text(), 'more info')]/@href").extract() if x.strip()]
    for href in hrefs:
        yield Request(urljoin_rfc(base, href), callback=self.parse_product_page)
def parse_product(self, response):
    """Scrape products from the item grid; an image-button link paginates
    the listing back into this same callback."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    next_page = hxs.select(u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]), callback=self.parse_product)

    for product in hxs.select(u'//div[contains(@class,"itemGrid")]'):
        loader = ProductLoader(item=Product(), selector=product)
        url = product.select(u'.//a[@class="oesLink"]/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base, url))
        # Name is split across a span and the link's own text node.
        name = product.select(u'.//a[@class="oesLink"]/span/text()').extract()[0]
        name += ' ' + product.select(u'.//a[@class="oesLink"]/text()').extract()[0]
        loader.add_value('name', name)
        loader.add_xpath('price', u'.//span[@class="PlistOfferPrice"]/text()', re=u'\$(.*)')
        loader.add_xpath('price', u'.//div[@class="pricing"]/span/div/span/text()', re=u'\$(.*)')
        # Only yield cells that produced both a name and a price.
        if loader.get_output_value('name') and loader.get_output_value('price'):
            yield loader.load_item()
def parse(self, response):
    """Follow site-nav and category-list links, paginate via the blue
    arrow button, and schedule show-name product links."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select(u'//a[@class="siteNavLink"]/@href').extract():
        yield Request(urljoin_rfc(base, href))

    for href in hxs.select(u'//ul[@id="categoryList"]/li/a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    next_page = hxs.select(u'//a[child::span[@class="blueArrowRightBtn"]]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for href in hxs.select(u'//div[@class="show"]/ul/li//h1[@class="showName"]/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Follow subcategory links (orange-arrow menus and category-table
    cells) plus 'Next Page' pagination, then parse products in place.
    Top-level category and price-list crawling were deliberately disabled."""
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    subcats = hxs.select('//img[contains(@src, "orange-arrow.gif")]/../font/a/@href').extract()
    subcats += hxs.select('//table[@class="categorytable"]//td[@class="categorymodelcell"]//a/@href').extract()
    for subcat in subcats:
        yield Request(urljoin_rfc(base, subcat))

    next_page = hxs.select('//a/b[contains(text(), "Next Page")]/../@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for product in self.parse_products(hxs, response):
        yield product
def parse(self, response):
    """Follow truck-brand and category links plus 'Next' pagination,
    then schedule product links for detail parsing."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select('//div[@id="trucks"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    for href in hxs.select('//div[@id="categories"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    next_page = hxs.select('//ul[contains(@class, "pagination")]/li/a[contains(text(), "Next")]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for href in hxs.select(u'//div[contains(@class,"products_content")]/ul/li/h4/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Load products from the item rows (products without a quantity input
    have subproducts and get a parse_sub visit), recurse through the
    category menu by nesting level, and follow pagination."""
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for item in hxs.select(u'//tr[contains(@class,"product-item")]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')
        # "1.234,56" -> "1234.56".
        price = item.select(u'.//span[@class="js_price_tax"]/text()').extract()[0]
        price = price.strip().replace('.', '').replace(',', '.')
        loader.add_value('price', price)
        url = item.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()[0]
        url = urljoin_rfc(base, url)
        loader.add_value('url', url)
        # A quantity input marks a plain product; otherwise it has subproducts.
        if item.select(u'.//input[@name="products_qty"]').extract():
            yield loader.load_item()
        else:
            yield Request(url, callback=self.parse_sub)

    # Each menu level adds one ul/li layer to the XPath.
    level = response.meta.get('level', 1)
    sub_xpath = u'//div[@class="box-content"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    for subcategory in hxs.select(sub_xpath).extract():
        yield Request(urljoin_rfc(base, subcategory), meta={'level': level + 1})

    next_url = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
    if next_url:
        yield Request(urljoin_rfc(base, next_url[0]), meta={'level': level})
def parse(self, response):
    """Scrape products from the listing; paginate, then pop the next queued
    category once paging is exhausted.

    response.meta['cats'] (optional) is a queue of category URLs that is
    carried through pagination requests.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for product in hxs.select(u'//div[@class="prodInfo"]'):
        product_loader = ProductLoader(item=Product(), selector=product)
        url = product.select(u'.//a[contains(@class,"prodLink")]/@href').extract()[0]
        product_loader.add_value('url', urljoin_rfc(base, url))
        name = product.select(u'.//a[contains(@class,"prodLink")]/text()').extract()[0].strip()
        product_loader.add_value('name', name)
        # Price is split into "big" (dollars) and "small" (cents) spans;
        # the *OutStock2 variants appear for out-of-stock products.
        try:
            price = product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="bigPriceText2"]/text()').re('\$(.*)')[0]
            price += product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="smallPriceText2"]/text()').extract()[0]
        except IndexError:
            price_big = product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="bigPriceTextOutStock2"]/text()').re('\$(.*)')
            price_small = product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="smallPriceTextOutStock2"]/text()').extract()
            if price_big and price_small:
                price = price_big[0] + price_small[0]
            else:
                continue  # no price found at all — skip this product
        product_loader.add_value('price', price)
        yield product_loader.load_item()

    # Pagination; fall back to the next queued category when exhausted.
    # BUGFIX: the original indexed response.meta['cats'] directly in the
    # pagination branch, raising KeyError when 'cats' was absent (e.g.
    # entry via a start URL); use .get() with an empty default instead.
    cats = response.meta.get('cats', [])
    next_page = hxs.select(u'//a[@class="jump next"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]), callback=self.parse,
                      meta={'cats': cats[:]})
    elif cats:
        yield Request(cats[0], meta={'cats': cats[1:]})
def parse(self, response):
    """Extract category and product URLs from onclick="…assign('…')"
    handlers and from plain anchors, skipping javascript pseudo-links.
    The current page may itself be a product page, so parse it too."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    # Category links are buried in onclick handlers.
    for target in hxs.select(u'//div[@class="menuItem"]/@onclick').re('\.assign\(\'(.*)\''):
        url = urljoin_rfc(base, '/' + target)
        if ('javascript' not in url) and ('Javascript' not in url):
            yield Request(url)

    product_links = hxs.select(u'//div/img/../@onclick').re('assign\(\'(.*)\'')
    product_links += hxs.select(u'//div[@class="catpadding"]//div[@class="DefaultFont"]/a/@href').extract()
    product_links += hxs.select(u'//table[@id="Table_01"]//div/a[child::img]/@href').extract()
    for href in product_links:
        url = urljoin_rfc(base, href)
        if ('javascript' not in url) and ('Javascript' not in url):
            yield Request(url, callback=self.parse_product)

    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Follow the menu-bar categories and text-listing subcategories, then
    schedule listing-table product links for detail parsing."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select(u'//ul[@id="MenuBar1"]/li/a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    for href in hxs.select(u'//table[@class="TextListingTable"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    # NOTE: no pagination on these listings.
    for href in hxs.select(u'//table[@class="ListingTable"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse_course_list(self, response):
    """Walk search-result pages for one field at a time, accumulating in
    response.meta['courses'] which fields each course URL belongs to.

    Expected meta keys (set by the caller): 'courses' (dict url -> [field
    names]), 'field_name' (field being processed), 'fields' (remaining
    (name, url) pairs). Order: paginate current field, then next field,
    then — once all fields are done — request every collected course.
    """
    hxs = HtmlXPathSelector(response)
    programs = hxs.select(u'//ul[@id="SearchResults"]/li/h2/a/@href').extract()
    if programs:
        # Record the current field against every course URL found.
        for url in programs:
            url = urljoin_rfc(get_base_url(response), url)
            response.meta['courses'].setdefault(url, []).append(response.meta['field_name'])
    else:
        # No search results: treat this page itself as a course page.
        response.meta['field_names'] = [response.meta['field_name']]
        for x in self.parse_course(response):
            yield x
    # Pagination
    next_url = hxs.select(u'//div[@id="PageNumbers"]/a[@class="next"]/@href').extract()
    if next_url:
        url = urljoin_rfc(get_base_url(response), next_url[0])
        yield Request(url, meta=response.meta, callback=self.parse_course_list)
    # Next field
    elif response.meta['fields']:
        field_name, field_url = response.meta['fields'].pop()
        response.meta['field_name'] = field_name
        yield Request(field_url, meta=response.meta, callback=self.parse_course_list)
    # All fields processed, do courses
    else:
        for course, fields in response.meta['courses'].items():
            yield Request(course, meta={'field_names': fields}, callback=self.parse_course)
def parse(self, response):
    """Follow the products menu and non-catprods item links (subcategories),
    paginate via 'i-next', and schedule catprods items as products."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select(u'//div[@id="ctl00_menu_products_pnlsmenu"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    # Items outside the catprods container are subcategories.
    for href in hxs.select(u'//div[@class="item" and not(parent::div[@class="catprods"])]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    next_page = hxs.select(u'//a[@class="next i-next"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    # Items inside catprods are the actual products.
    for href in hxs.select(u'//div[@class="item" and parent::div[@class="catprods"]]//a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Zip the listing's names, hrefs and prices into products, then
    follow the 'NextPage' link."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    prod_names = hxs.select('//h4/a/@title').extract()
    prod_urls = hxs.select('//h4/a/@href').extract()
    prices = hxs.select('//td[@class="ProductPrice"]/h4/text()').extract()
    prices = [p.strip().strip(u'\xa3') for p in prices]  # drop the GBP sign
    for name, url, price in zip(prod_names, prod_urls, prices):
        # BUGFIX: the original tested `if url:` AFTER urljoin_rfc, whose
        # result is always non-empty, so the guard was dead; test the raw
        # href so entries with a missing link are skipped instead of being
        # emitted with the listing page's own URL.
        if not url:
            continue
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', urljoin_rfc(base, url))
        loader.add_value('name', name)
        loader.add_value('price', price)
        yield loader.load_item()

    next_page = hxs.select('//a[@class="NextPage"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))
def parse(self, response):
    """Follow the 'Next' pager link — via XPath first, falling back to
    BeautifulSoup for broken markup — then parse products in place.
    Loopback URLs emitted by the site are rewritten to the public host."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body, convertEntities=BeautifulSoup.HTML_ENTITIES)

    def _public(url):
        # The site sometimes links to 127.0.0.1; point back at the real host.
        if '127.0.0.1' in url:
            url = url.replace('127.0.0.1', 'argonautliquor.com')
        return url

    next_page = hxs.select(u'//div[@class="pager"]/a[contains(text(),"Next")]/@href').extract()
    if next_page:
        url = _public(urljoin_rfc(get_base_url(response), next_page[0]))
        yield Request(url, dont_filter=True)
    else:
        anchor = soup.find(lambda tag: tag.name == 'a' and 'Next' in tag.text and tag.findParent('div', 'pager'))
        if anchor:
            url = _public(urljoin_rfc(get_base_url(response), anchor['href']))
            yield Request(url, dont_filter=True)

    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Scrape item wrappers into products and recurse through the shop
    menu, whose nesting depth is tracked via meta['level']."""
    if response.url in self.junk_urls:
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for item in hxs.select(u'//div[@class="item_wrapper"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_xpath('name', u'.//div[@class="name"]/a/text()')
        # Danish format, e.g. "Kr. 1.234,56" -> "1234.56".
        price = item.select(u'.//div[@class="price"]/text()[last()]').extract()[0]
        price = price.strip().lstrip('Kr. ').replace('.', '').replace(',', '.')
        loader.add_value('price', price)
        url = item.select(u'.//div[@class="name"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base, url))
        yield loader.load_item()

    # Each menu level adds one ul/li layer to the XPath.
    level = response.meta.get('level', 1)
    sub_xpath = u'//div[@id="shopnav"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    for subcategory in hxs.select(sub_xpath).extract():
        yield Request(urljoin_rfc(base, subcategory), meta={'level': level + 1})
def parse(self, response):
    """Follow brand links, category refinements and 'View More' expanders,
    paginate via the image 'Next Page' button, then parse products."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    for href in hxs.select(u'//div[@id="shop_content"]//li[not(@class="first")]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    for href in hxs.select('//h2[text()="Product Category"]/following-sibling::div[1]/form[@id="product-refinement"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))

    # "View More ..." links expand truncated refinement lists.
    for href in hxs.select('//p[@class="arrow_sym"]/a[@class="SearchLinkBold" and starts-with(@title, "View More")]/@href').extract():
        yield Request(urljoin_rfc(base, href))

    next_page = hxs.select(u'//div[@id="pagination" and @class="pagination"]//a[child::img and @title="Next Page"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))

    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Follow category links and pagination — propagating response.meta —
    and schedule product links for detail parsing."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    category_links = hxs.select(u'//div[@class="Category"]//a/@href').extract()
    category_links += hxs.select(u'//h3[@class="Org LeftNavMenu"]//a/@href').extract()
    for href in category_links:
        yield Request(urljoin_rfc(base, href), meta=response.meta)

    next_page = hxs.select(u'//div[@class="pager"]//a[contains(text(),"Next")]/@href').extract()
    if next_page:
        url = urljoin_rfc(base, next_page[0])
        # The site's pager can append "&page=" twice; drop the first copy.
        if url.count(u'&page=') > 1:
            url = re.sub(u'&page=\d+', u'', url, 1)
        yield Request(url, meta=response.meta)

    for href in hxs.select(u'//div[contains(@class,"ProDes1")]/div/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product, meta=response.meta)
def parse_product(self, response):
    """Parse a dreamtimebeds.co.uk product page.

    Collects base product data, then either walks the first option
    <select> via AJAX option requests (continued in ``parse_option``) or,
    for products with no options, yields the item directly.
    """
    hxs = HtmlXPathSelector(response)
    product_data = {}
    product_data['name'] = hxs.select(self.name_xpath)[0].extract()
    product_data['url'] = response.url
    product_data['category'] = hxs.select(self.category_xpath)[0].extract()
    product_data['image_url'] = hxs.select(self.img_xpath).extract()
    if product_data['image_url']:
        product_data['image_url'] = urljoin_rfc(
            get_base_url(response), product_data['image_url'][0])
    product_data['brand'] = hxs.select(self.brand_xpath).extract()
    # Flat-rate delivery charge used throughout this spider.
    product_data['shipping_cost'] = '19.99'
    product_data['stock'] = '1'
    # AJAX endpoint template for option-dependent price/stock fragments;
    # {product_id}/{option_id} are filled in below via str.format.
    req_url = 'http://dreamtimebeds.co.uk/index.php?dispatch=products.options&'\
        'changed_option[{product_id}]={option_id}&'\
        'appearance[show_sku]=1&'\
        'appearance[show_price_values]=1&'\
        'appearance[show_old_price]=1&'\
        'appearance[show_price_values]=1&'\
        'appearance[show_price]=1&'\
        'appearance[show_price_values]=1&'\
        'appearance[show_list_discount]=1&'\
        'appearance[show_discount_label]=1&'\
        'appearance[show_price_values]=1&'\
        'appearance[show_product_amount]=1&'\
        'appearance[show_product_options]=1&'\
        'appearance[details_page]=1&'\
        'additional_info[info_type]=D&'\
        'additional_info[get_icon]=1&'\
        'additional_info[get_detailed]=1&'\
        'additional_info[get_options]=1&'\
        'additional_info[get_discounts]=1&'\
        'additional_info[get_features]=&'\
        'additional_info[get_extra]=&'\
        'additional_info[get_categories]=&'\
        'additional_info[get_taxed_prices]=1&'\
        'additional_info[get_for_one_product]=1&'\
        'appearance[show_qty]=1&'\
        'appearance[show_list_buttons]=1&'\
        'appearance[but_role]=big&'\
        'appearance[separate_buttons]=1&'\
        'appearance[quick_view]=&'\
        'appearance[capture_options_vs_qty]=&'\
        'product_data[{product_id}][amount]=1&'
    # NOTE(review): option_url is reassigned before use in the loop below.
    option_url = 'product_data[{product_id}][product_options][{select_id}]={option_value}'
    # Headers mimic the site's own XHR so the AJAX endpoint answers.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'dreamtimebeds.co.uk',
        'Pragma': 'no-cache',
        'Referer': response.url,  #' http://dreamtimebeds.co.uk/new-world-windsor-divan-bed.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # The form name encodes the product id: product_form_<id>.
    form = hxs.select('//form[contains(@name,"product_form_")]')
    form_name = form.select('./@name')[0].extract()
    product_id = form_name.split('_')[-1]
    inputs = form.select('.//input')  # NOTE(review): unused
    form_data = {
        'result_ids': 'product_images_{product_id}_update,sku_update_{product_id},old_price_update_{product_id},price_update_{product_id},line_discount_update_{product_id},discount_label_update_{product_id},product_amount_update_{product_id},product_options_update_{product_id},advanced_options_update_{product_id},qty_update_{product_id}'
        .format(product_id=product_id)
    }
    # Template request reused (via .replace) for every option combination.
    req = FormRequest(response.url,
                      formdata=form_data,
                      headers=headers,
                      callback=self.parse_option,
                      dont_filter=True)
    req.meta['req'] = req
    req.meta['product_id'] = product_id
    selects = hxs.select(self.select_xpath)
    if selects:
        # Kick off one AJAX request per value of the FIRST option select;
        # deeper levels are walked recursively in parse_option.
        first_select = [(elem.select('./@value')[0].extract(),
                         elem.select('./text()')[0].extract())
                        for elem in selects[0].select('./option')]
        first_select_id = selects[0].select('./@id').re('_(\d+)$')[0]
        for value, text in first_select:
            select_name = selects[0].select('./@name')[0].extract()
            if value:  # skip the empty "please select" entry
                option_url = req_url.format(
                    product_id=product_id,
                    option_id=first_select_id) + select_name + '=' + value
                option_req = req.replace(url=option_url)
                option_req.meta['option_level'] = 1
                option_req.meta['product'] = product_data
                yield option_req
    else:
        # No options: build and yield the item straight from this page.
        loader = ProductLoader(item=Product(), response=response)
        price = hxs.select(self.price_xpath).extract()
        if price:
            price = extract_price(price[0])
            # In-store-only products carry the delivery charge in the price.
            in_store = hxs.select(self.in_store_xpath)
            if in_store:
                price += Decimal('19.99')
        identifier = hxs.select('//input[@type="hidden"]').re(
            'product_data\[(\d+?)\]\[product_id\]')[0]
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_value('name', product_data['name'])
        loader.add_value('stock', '1')
        loader.add_xpath('category', self.category_xpath)
        loader.add_xpath('brand', self.brand_xpath)
        loader.add_value('shipping_cost', '19.99')
        loader.add_value('sku', identifier)
        loader.add_value('identifier', identifier)
        image_url = hxs.select(self.img_xpath).extract()
        if image_url:
            image_url = urlparse.urljoin(get_base_url(response), image_url[0])
            loader.add_value('image_url', image_url)
        yield loader.load_item()
def parse_option(self, response):
    """Handle the AJAX options response for one option combination.

    Walks each option <select> level recursively (one request per value);
    once every level is chosen, extracts the price/stock/image HTML
    fragments from the JSON payload and yields the option's item.
    """
    # Server answered 411 (Length Required): re-queue the same request.
    if response.status == 411:
        yield response.request
        return
    option_level = response.meta.get('option_level', 0)
    req = response.meta.get('req')
    product_id = response.meta.get('product_id')
    options = json.loads(response.body)
    hxs = HtmlXPathSelector(
        text=options['html']['product_options_update_' + product_id])
    # All option selects except the delivery-method selector.
    selects = hxs.select(
        './/div[not(child::label[contains(text(),"Select Your Delivery")])]/select'
    )
    if option_level < len(selects):
        # More levels to choose: issue one request per value of this level.
        select = [(elem.select('./@value')[0].extract(),
                   elem.select('./text()')[0].extract())
                  for elem in selects[option_level].select('./option')]
        select_id = selects[option_level].select('./@id').re('_(\d+)$')[0]
        select_name = selects[option_level].select('./@name')[0].extract()
        for value, text in select:
            if value:  # skip the empty placeholder option
                option_url = response.url + '&' + select_name + '=' + value
                # Re-point changed_option[...] at the select just varied.
                option_url = re.sub('(changed_option\[.*?\]=)\d+?&',
                                    '\g<1>{}&'.format(select_id),
                                    urllib.unquote(option_url))
                option_req = req.replace(url=option_url, meta=response.meta)
                option_req.meta['option_level'] = option_level + 1
                yield option_req
    else:
        # All levels selected: build the item from the HTML fragments.
        price_sel = HtmlXPathSelector(
            text=options['html']['price_update_' + product_id])
        stock_sel = HtmlXPathSelector(
            text=options['html']['product_amount_update_' + product_id])
        image_sel = HtmlXPathSelector(text=options['html'][
            'product_images_{}_update'.format(product_id)])
        image_url = image_sel.select(
            './/a[contains(@class,"cm-image-previewer cm-previewer")]/@href'
        ).extract()
        product_data = response.meta.get('product')
        # Identifier = product id + every selected option value.
        option_name = ' '.join(
            selects.select('./option[@selected]/text()').extract())
        identifier = product_id + '_' + '_'.join(
            selects.select('./option[@selected]/@value').extract())
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(
            'name',
            clean_name('{} {}'.format(product_data['name'], option_name)))
        loader.add_value('category', product_data['category'])
        loader.add_value('brand', product_data['brand'])
        loader.add_value('url', product_data['url'])
        loader.add_value('shipping_cost', '19.99')
        # The second price-num span holds the current (post-discount) price.
        price = price_sel.select(
            './/span[@class="price"]/span[@class="price-num"][2]/text()'
        ).extract()
        loader.add_value('price', price if price else '0.00')
        loader.add_value(
            'stock',
            1 if stock_sel.select('.//span[@class="in-stock"]') else 0)
        if image_url:
            loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response),
                                         image_url[0]))
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        yield loader.load_item()
def parse_product(self, response):
    """Parse a hamleys.com product page into a Product with review metadata."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    try:
        name = hxs.select(
            '//*[@itemprop="name"]/text()').extract().pop().strip()
    except IndexError:
        # No product name on this URL form: retry via the detail.jsp form.
        yield Request(response.url.replace(
            'hamleys.com/',
            'hamleys.com/detail.jsp?pName=').replace('.ir', ''),
                      callback=self.parse_product)
        return
    out_of_stock = 'OUT OF STOCK' in ''.join(
        hxs.select(
            '//li[@class="stockStatus"]/span/text()').extract()).upper()
    # Price digits from the main price div, falling back to the
    # detailOurPrice span.
    price = "".join(
        hxs.select('//div[@class="productprice "]/text()').re("([.0-9]+)")
        or hxs.select(
            '//div[@class="productprice "]/span[@class="detailOurPrice"]/text()'
        ).re("([.0-9]+)"))
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//img[@class="productMain"]/@src',
                     TakeFirst())
    loader.add_value('price', price)
    # Second-to-last breadcrumb is the product's category.
    category = hxs.select(
        '//div[@class="pagetopnav"]/ul[contains(@class, "crumb")]/li/a/text()'
    ).extract()[-2]
    loader.add_value('category', category)
    # SKU is the trailing 3+ digit run in the product name.
    loader.add_value('sku', name, re=' (\d\d\d+)\s*$')
    loader.add_value('brand', response.meta.get('brand', ''))
    identifier = hxs.select(
        '//*[@itemprop="productID"]/text()').extract()[0].replace(
            'Code: ', '')
    loader.add_value('identifier', identifier)
    if out_of_stock:
        loader.add_value('stock', 0)
    item = loader.load_item()
    metadata = ToyMonitorMeta()
    promotions = response.meta.get('promotions', '')
    metadata['reviews'] = []
    item['metadata'] = metadata
    if promotions:
        # Map site promo text through the spider's promos dictionary.
        item['metadata']['promotions'] = self.promos.get(
            promotions, promotions)
    reviews = hxs.select('//div[@class="reviewbody"]')
    prod_id = response.xpath('//input[@name="id"]/@value').extract()[0]
    # Only trust on-page reviews when the review widget targets this id.
    has_reviews = response.xpath(
        '//a[@class="writeReviewLink" and contains(@onclick, "' + prod_id +
        '")]').extract()
    if has_reviews:
        for review in reviews:
            review_loader = ReviewLoader(item=Review(),
                                         response=response,
                                         date_format="%B %d, %Y")
            #review_date = datetime.datetime.strptime(review['SubmissionTime'].split('.')[0], '%Y-%m-%dT%H:%M:%S')
            #review_loader.add_value('date', review_date.strftime("%B %d, %Y"))
            title = ''.join(
                review.select(
                    './/div[@class="reviewTagLine"]/text()').extract())
            text = ''.join(
                review.select(
                    './/div[@class="reviewText"]/text()').extract())
            if title:
                full_text = title.encode('utf-8') + '\n' + text.encode(
                    'utf-8')
            else:
                full_text = text.encode('utf-8')
            review_loader.add_value('full_text',
                                    unicode(full_text, errors='ignore'))
            # Star rating is a CSS width percentage: /20 maps 0-100 to 0-5.
            rating = float(
                review.select('.//div[@class="reviewStarsInner"]/@style').
                re('\d+.\d+')[0]) / 20
            review_loader.add_value('rating', rating)
            review_loader.add_value('url', item['url'])
            item['metadata']['reviews'].append(review_loader.load_item())
    yield item
def parse_product(self, response):
    """Parse a product page, expanding single-select ("options") and
    multi-select ("options[]") variants into separate items."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Known manufacturers come from the search form's dropdown.
    brands = hxs.select(
        '//select[contains(@class, "searchManufacturer")]/option/text()'
    ).extract()
    loader = ProductLoader(item=Product(), response=response)
    product_name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
    product_price = hxs.select(
        '//input[@class="priceBox"]/@value').extract()[0]
    product_code = hxs.select(
        '//input[@name="prod_id"]/@value').extract()[0]
    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    # Brand = whichever known manufacturer name appears in the title
    # (last match wins).
    product_brand = ''
    for brand in brands:
        if brand.upper() in product_name.upper():
            product_brand = brand
    # SKU from the Google Analytics ecomm_prodid inline snippet.
    sku = re.findall(u'ecomm_prodid: \'(.*)\'', response.body)
    sku = sku[0] if sku else ''
    # Breadcrumbs minus home and current page.
    categories = hxs.select(
        '//h2[@class="breadcrumbs"]/span/a/span/text()').extract()[1:-1]
    product_price = extract_price(product_price)
    options = hxs.select('//select[@name="options"]/option')
    if options:
        # Single option select: entries look like "Name (£x.xx)".
        for option in options:
            loader = ProductLoader(response=response, item=Product())
            option_identifier = option.select('@value').extract()[0]
            option_name = option.select('text()').re(r'(.*) \(')[0]
            option_price = option.select('text()').re(u'\(\xa3(.*)\)')
            option_price = option_price[0] if option_price else '0'
            loader.add_value("identifier",
                             product_code + '-' + option_identifier)
            loader.add_value('name', product_name + ' ' + option_name)
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            loader.add_value('price', extract_price(option_price))
            loader.add_value('url', response.url)
            loader.add_value('sku', sku)
            loader.add_value('brand', product_brand)
            for category in categories:
                loader.add_value('category', category)
            product = loader.load_item()
            yield product
    else:
        options_containers = hxs.select('//select[@name="options[]"]')
        if options_containers:
            # Several selects: one item per cartesian combination.
            combined_options = []
            for options_container in options_containers:
                element_options = []
                for option in options_container.select('option'):
                    option_id = option.select('@value').extract()[0]
                    option_name = option.select('text()').extract()[0]
                    option_attr = (option_id, option_name)
                    element_options.append(option_attr)
                combined_options.append(element_options)
            combined_options = list(itertools.product(*combined_options))
            options = []
            for combined_option in combined_options:
                # Concatenate names/ids across the combination.
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + ' ' + option[1]
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + '-' + option[0]
                options.append(final_option)
            for option in options:
                loader = ProductLoader(response=response, item=Product())
                option_name = option['desc']
                option_identifier = option['identifier']
                loader.add_value("identifier",
                                 product_code + option_identifier)
                loader.add_value('name', product_name + option_name)
                if image_url:
                    loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
                # Combination options carry the base price (no surcharge).
                loader.add_value('price', product_price)
                loader.add_value('url', response.url)
                loader.add_value('sku', sku)
                loader.add_value('brand', product_brand)
                for category in categories:
                    loader.add_value('category', category)
                product = loader.load_item()
                yield product
        else:
            # No options at all: one plain product.
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', product_name)
            loader.add_value('url', response.url)
            loader.add_value('sku', sku)
            loader.add_value('identifier', product_code)
            loader.add_value('brand', product_brand)
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            for category in categories:
                loader.add_value('category', category)
            loader.add_value('price', product_price)
            yield loader.load_item()
def parse_product(self, response):
    """Parse a product page, yielding one item per variant/option.

    Three page layouts are handled:
      1. a variant table (``child-list with-hover`` rows),
      2. a ``product-options`` select whose variant data lives in inline
         JS maps (childMap / prices / skus / stockStatuses),
      3. a plain single product.

    Fixes: the free-delivery threshold is now consistently
    ``price <= 49.99 -> 3.95 shipping`` in all three branches (the option
    and single-product branches previously used a strict ``<``, giving
    free delivery at exactly 49.99); the bare ``except`` around the JS-map
    parsing is narrowed to ``(IndexError, ValueError)``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    url = urljoin_rfc(base_url, response.url)
    image_url = hxs.select('//img[@id="product-image-main"]/@src').extract()
    product_name = hxs.select('//*[@id="product-header"]//h1/text()').extract()
    if product_name:
        product_name = product_name[0].strip()
    else:
        log.msg('Skips product without name: ' + response.url)
        return
    # Last breadcrumb is the category.
    category = hxs.select('//div[@class="crumbs"]/span/a/span/text()').extract()[-1]
    brand = hxs.select('//*[@id="product-header"]/a/img/@alt').extract()
    brand = brand[0] if brand else ''

    options = hxs.select('//table[@class="child-list with-hover"][1]/tbody/tr')
    if options:
        # Layout 1: one table row per variant.
        for option in options:
            columns = option.select('./td')
            name = ''
            sku = ''
            get_name = 1
            in_stock = 1
            identifier = ''
            for column in columns:
                ctype = column.select('./@class').extract()[0]
                if ctype == 'code':
                    # Cells before the code cell form the option suffix;
                    # prepend the product name once the code is reached.
                    get_name = 0
                    name = product_name + name
                    sku = column.select('./text()').extract()[0]
                if get_name:
                    name += ' - ' + column.select('./text()').extract()[0]
                if ctype == 'price':
                    price = column.select('.//input/@value').extract()[-1]
                    price = extract_price(price)
                if ctype == 'status out-of-stock':
                    in_stock = 0
            identifier = sku
            loader = ProductLoader(item=Product(), selector=option)
            loader.add_value('identifier', identifier)
            loader.add_value('url', url)
            # Colour swatch text for this SKU, if shown.
            colour = hxs.select('//li[.//td[text()="'+sku+'"]]/div[contains(@class, "colour")]/p/text()').extract()
            if colour:
                name = name + ' ' + colour[0]
            loader.add_value('name', name)
            if image_url:
                loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            if not in_stock:
                loader.add_value('stock', 0)
            loader.add_value('shipping_cost', 3.95 if price <= 49.99 else 0)
            yield loader.load_item()
    else:
        options = hxs.select('//div[@class="product-options"]//option[not(@title="Not Selected")]')
        if options:
            # Layout 2: variant data embedded in inline JS dictionaries.
            try:
                options_mappings = json.loads(re.findall(re.compile("childMap\': (\{.+?}),\n"), response.body)[0])
                options_prices = json.loads(re.findall(re.compile("prices\': (\{.+?}),\n"), response.body)[0])
                options_skus = json.loads(re.findall(re.compile("skus\': (\{.+?}),\n"), response.body)[0])
                options_stocks = json.loads(re.findall(re.compile("stockStatuses\': (\{.+?}),\n"), response.body)[0])
            except (IndexError, ValueError):
                # Missing JS map or malformed JSON: nothing to yield.
                return
            for option in options:
                loader = ProductLoader(item=Product(), selector=hxs)
                option_name = product_name + ' ' + option.select("./@title").extract()[0]
                option_id = option.select("./@value").extract()[0]
                option_mapping = str(options_mappings[option_id])
                option_price = extract_price(str(options_prices[option_mapping][0]['purchase']))
                option_sku = options_skus[option_mapping]
                option_stock = 0 if 'Out' in options_stocks[option_mapping] else 1
                loader.add_value('identifier', option_sku)
                loader.add_value('sku', option_sku)
                loader.add_value('url', url)
                loader.add_value('name', option_name)
                loader.add_value('price', option_price)
                loader.add_value('brand', brand)
                loader.add_value('category', category)
                loader.add_value('stock', option_stock)
                if image_url:
                    loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
                # Consistent threshold (was a strict "<" here).
                loader.add_value('shipping_cost', 3.95 if option_price <= 49.99 else 0)
                yield loader.load_item()
        else:
            # Layout 3: plain single product.
            loader = ProductLoader(item=Product(), selector=hxs)
            sku = hxs.select('//div[@class="title"]//p/text()').extract()[0]
            sku = sku.replace('Product Code: P', '')
            loader.add_value('identifier', sku)
            loader.add_value('sku', sku)
            loader.add_value('url', url)
            loader.add_value('name', product_name)
            if image_url:
                loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            price = hxs.select('//*[@id="product-price"]//input/@value').extract()[0]
            price = extract_price(price)
            loader.add_value('price', price)
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            in_stock = hxs.select('//*[@id="product-stock"]/text()').extract()[0]
            if in_stock != 'In stock':
                loader.add_value('stock', 0)
            # Consistent threshold (was a strict "<" here).
            loader.add_value('shipping_cost', 3.95 if price <= 49.99 else 0)
            yield loader.load_item()
def parse_product(self, response):
    """Parse a product page; also re-runs category parsing on the page and
    handles Distil antibot interstitials with up to 50 retries per URL."""
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    # A product page can also contain category listings.
    for product in self.parse_category(response):
        yield product
    # Distil antibot block: retry the same request, counting in meta.
    if hxs.select('//div[@id="distil_ident_block"]'):
        retries = response.meta.get('retries', 0)
        if retries < 50:
            self.log('Retrying %s (antibot protection)' % response.url)
            yield response.request.replace(meta={'retries': retries + 1},
                                           dont_filter=True)
        else:
            self.log('Gave up retrying %s (antibot protection)' % response.url)
        return
    loader = ProductLoader(selector=hxs, item=Product())
    # CMS-frame headings link to further product pages.
    products = hxs.select(
        '//div[@class="u-cms-frame"]//h1/a/@href').extract()
    for product in products:
        url = urljoin(base_url, product)
        # Lambda defaults bind url/callback at definition time
        # (avoids the late-binding closure pitfall).
        yield Request(
            url,
            callback=self.parse_product,
            errback=lambda failure, retries=0, url=url, callback=self.
            parse_product: self.on_error(failure, retries, url, callback))
    product_name = hxs.select(
        '//*[@id="product_addtocart_form"]//div[@class="product-name"]/h1/text()'
    ).extract()
    if not product_name:
        self.log('Warning: no product name: {}'.format(response.url))
        return
    else:
        product_name = product_name[0]
    brand = hxs.select(
        '//*[@id="product_addtocart_form"]//span[@class="product-phrase"]/text()'
    ).extract()
    if not brand:
        brand = hxs.select(
            '//div[@class="product-attributes"]//span[@class="product-phrase"]/text()'
        ).extract()
    brand = brand[0].strip() if brand else ''
    # NOTE(review): this price goes into `loader`, which is never loaded;
    # the items below are built with separate product_loader instances.
    loader.add_xpath(
        'price',
        '//*[@id="product_addtocart_form"]//span[@class="price"]/text()')
    price = hxs.select(
        '//*[@id="product_addtocart_form"]//span[@class="price"]/text()'
    ).extract()
    if price:
        price = extract_price(price[0])
    else:
        price = extract_price('0')
    image_url = hxs.select('//*[@id="image-main"]/@src').extract()
    sku = hxs.select(
        '//div[@class="label-container"]/label[text()="Sku:"]/../../div/span/text()'
    ).extract()
    sku = sku[0].strip() if sku else ''
    # Identifier is the URL path, minus slashes and extension.
    product_identifier = urlsplit(response.url).path
    product_identifier = product_identifier.strip('/')
    product_identifier = product_identifier.split('.')[0]
    products = {}
    has_options = False
    # Magento-style spConfig JSON holding attribute options.
    options_config = re.search(
        r'var spConfig *= *new Product.Config\((.*)\)', response.body)
    colors_data = re.search(r'<script.*>ColorOverlay\.setData\((.*?)\);',
                            response.body)
    if colors_data:
        colors_data = json.loads(colors_data.groups()[0])
        if isinstance(colors_data, dict):
            # Python 2: dict.keys() is a list, so [0] is valid here.
            if colors_data.keys()[0] != u'':
                has_options = True
                for color in colors_data.itervalues():
                    products[color['color_label']] = color['color_name']
    if options_config:
        product_data = json.loads(options_config.groups()[0])
        if product_data['attributes']:
            has_options = True
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    products[option['label']] = option['label']
    if has_options:
        # One item per collected colour/attribute option.
        for identifier, option_name in products.iteritems():
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)
            product_loader.add_value('name', product_name + ' ' + option_name)
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin(base_url, image_url[0]))
            # Orders under 50.00 pay 7.95 delivery.
            if price < 50.0:
                product_loader.add_value('shipping_cost', 7.95)
            else:
                product_loader.add_value('shipping_cost', 0)
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('sku', sku)
            product = product_loader.load_item()
            yield product
    else:
        # Single product without options.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin(base_url, image_url[0]))
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        if price < 50.0:
            product_loader.add_value('shipping_cost', 7.95)
        else:
            product_loader.add_value('shipping_cost', 0)
        product_loader.add_value('sku', sku)
        product = product_loader.load_item()
        yield product
def parse(self, response):
    """Schedule every table link on the page for product-list parsing."""
    base = get_base_url(response)
    links = HtmlXPathSelector(response).select(
        u'//a[@class="tableLink"]/@href').extract()
    for href in links:
        yield Request(urljoin_rfc(base, href),
                      callback=self.parse_product_list)
def parse(self, response):
    """Follow every link in the custom menu into category parsing."""
    base = get_base_url(response)
    menu_links = HtmlXPathSelector(response).select(
        '//div[@id="custommenu"]//a/@href').extract()
    for href in menu_links:
        yield Request(urljoin(base, href), callback=self.parse_category)
def get_base_url(self, response):
    """Return the base URL for *response*, or ``None`` if it cannot be
    determined.

    Wraps the module-level ``get_base_url`` helper. The previous bare
    ``except:`` swallowed everything (including ``KeyboardInterrupt`` and
    ``SystemExit``); it is narrowed to ``Exception``, and the fallthrough
    return is made explicit.
    """
    try:
        return get_base_url(response)
    except Exception:
        return None
def parse(self, response):
    """Follow every main-menu link into category parsing, keeping each
    request free of merged session cookies."""
    base = get_base_url(response)
    menu_links = HtmlXPathSelector(response).select(
        '//menu[@id="menu_container"]//a/@href').extract()
    for href in menu_links:
        yield Request(urljoin(base, href),
                      callback=self.parse_cat,
                      meta={'dont_merge_cookies': True})
def parse_product(self, response):
    """Parse a product page; yields the base item plus one copy per
    attribute option, adding any "(price)" surcharge to the base item."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)  # NOTE(review): unused
    loader = ProductLoader(item=Product(), selector=hxs)
    identifier = response.xpath(
        '//input[@id="products-id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    sku = response.xpath(
        '//span[@itemprop="model"]/text()').extract_first()
    loader.add_value('sku', sku)
    name = response.xpath('//h2/span[@itemprop="name"]/text()'
                          ).extract_first() or response.xpath(
                              '//h1/text()').extract_first()
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    # Prefer the machine-readable @content price; otherwise fall back
    # through progressively less specific selectors, defaulting to 0.
    price = response.xpath(
        '//span[@itemprop="price"]/@content').extract_first()
    if price:
        # Normalise to comma decimal separator (site locale).
        price = price.replace('.', ',')
    else:
        price = response.xpath(
            '//span[@itemprop="price"]/text()').extract_first(
            ) or response.css('div.current-price-container').xpath(
                'br/following::text()').extract_first() or response.css(
                    'div.current-price-container ::text').extract_first(
                    ) or 0
    loader.add_value('price', price)
    # Breadcrumbs minus the first and last entries (home / current page).
    category = hxs.select(
        '//div[@id="breadcrumb_navi"]/span/a/span/text()').extract()
    category = category[1:-1] if len(category) > 2 else ''
    loader.add_value('category', category)
    image_url = response.xpath(
        '//img[@itemprop="image"]/@src').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    item = loader.load_item()
    options = response.css('fieldset.attributes div div label')
    if options:
        for option in options:
            option_item = deepcopy(item)
            option_item['identifier'] += '-' + option.xpath(
                './/input/@value').extract_first()
            option_name = ' '.join(
                option.xpath('text()').extract()).strip()
            # A "(price)" suffix in the label is an option surcharge.
            if '(' in option_name:
                price = extract_price(option_name.split('(')[-1])
                option_name = option_name.split('(')[0].strip()
                option_item['price'] += price
            option_item['name'] += ' ' + option_name
            yield option_item
    else:
        yield item
def extract_links(self, response):
    """Delegate link extraction, supplying the response's base URL."""
    selector = Selector(response)
    return self._extract_links(selector, response.url, response.encoding,
                               get_base_url(response))
def parse_product(self, response):
    """Parse a tyre product page: one item per size/fitment option, with
    MicheldeverMeta tyre metadata attached to each."""
    hxs = HtmlXPathSelector(response)
    base_loader = ProductLoader(item=Product(), selector=hxs)
    # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
    # the pattern should be set as the product's name
    brand = response.meta.get('brand') or ''
    product_name = hxs.select('//h2[@class="heading black"]/text()')[0].extract().strip()
    # Strip the brand from the heading to leave the pattern name.
    product_name = re.sub(brand, '', product_name).strip()
    fitting_method = 'Delivered'
    base_loader.add_value('url', response.url)
    image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
    # Each option box uses this exact inline style as its marker.
    options = hxs.select('//div[@style="background: #fff; padding: 6px; "]')
    for option in options:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('name', product_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', response.url)
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))
        identifier = option.select('../input[@type="hidden" and @name="item_id"]/@value').extract()
        if not identifier:
            # Out-of-stock options expose the id via the "email me" link.
            identifier = option.select('./a/@href').re('email_me_stock/(.*)')
        if not identifier:
            continue
        loader.add_value('identifier', identifier[0])
        price = option.select('./strong[@class="price" and not(contains(text(),"On Backorder"))]/text()').extract()
        if price:
            loader.add_value('price', price[0])
        else:
            # No price shown: fall back to the listing price and mark
            # the option as out of stock.
            if response.meta.get('price'):
                loader.add_value('price', response.meta['price'])
            else:
                loader.add_value('price', '0.00')
            loader.add_value('stock', 0)
        pattern_name = option.select('./p/strong/text()').extract()
        if not pattern_name:
            pattern_name = option.select('./strong/text()').extract()
        pattern_name = pattern_name[0]
        # e.g. "205/55 R16 V (91...)" -> width/aspect/rim/speed/load.
        data = re.search('(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)', pattern_name)
        if data:
            data = data.groupdict()
        else:
            msg = 'ERROR parsing "{}" [{}]'.format(pattern_name, response.url)
            log.msg(msg)
            self.errors.append(msg)
            continue
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating'].upper()
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        xl = 'XL' in pattern_name
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = 'run flat' in pattern_name.lower() or 'runflat' in pattern_name.lower()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        # Manufacturer fitment mark (e.g. OE codes) found as a whole word.
        manufacturer_mark = [mark for mark in self.all_man_marks.keys()
                             if mark in pattern_name.split(' ')]
        manufacturer_mark = manufacturer_mark[0].strip() if manufacturer_mark else []
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''
        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))
        #metadata['alternative_speed_rating']))
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        # Recompute speed ratings; keep the old rating as the alternative
        # only when it actually changed.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_product_list(self, response):
    """Parse a product listing page: dedupe by the numeric URL id and
    follow each product page with its own cookie jar."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select('//div[contains(@class, "product producttile")]')
    for product in products:
        product_loader = ProductLoader(item=Product(), selector=product)
        product_name = product.select(
            './/div[@class="name"]/a/text()').extract()
        if not product_name:
            continue
        else:
            product_name = product_name[0].strip()
        image_url = product.select(
            './/img[@id="firimg"]/@src').extract()[0]
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url))
        product_loader.add_value('name', product_name)
        url = product.select('.//div[@class="name"]/a/@href').extract()[0]
        product_loader.add_value('url', url)
        # The numeric id in the URL is both identifier and dedupe key.
        match = re.search(r"(\d+)\.html", url)
        if match:
            identifier = match.group(1)
            if identifier in self.ids:
                continue
            else:
                self.ids.append(identifier)
        else:
            continue
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        price = product.select(
            './/div[@class="salesprice"]/text()').extract()[0]
        product_loader.add_value('price', extract_price(price))
        category = response.meta.get('category', '')
        if not category or response.meta.get('full'):
            # Fall back to the capacity/type line, split on the bullet.
            category2 = product.select(
                './/div[@class="capacityType"]/text()').extract()
            if category2:
                category2 = category2[0].split(u'\u2022')
                if len(category2) > 1:
                    category2 = category2[1]
                else:
                    category2 = category2[0]
                if category2.strip():
                    category = category2
        if category:
            # Diageo have requested that we group all categories with 'Whisky' or 'Whiskey' in the name into one category named 'Whisky'
            # https://www.assembla.com/spaces/competitormonitor/tickets/2254
            if 'whisky' in category.lower() or 'whiskey' in category.lower(
            ):
                category = 'Whisky'
            product_loader.add_value('category', category.strip())
        # Normalise known brand aliases before matching.
        for brand in self.brands:
            if brand in product_name.replace('A&J',
                                             'Alexander & James').replace(
                                                 u'C\xeeroc', 'Ciroc'):
                product_loader.add_value('brand', brand)
                break
        product = product_loader.load_item()
        # Fresh cookie jar per product request.
        self.jar_counter += 1
        yield Request(url,
                      callback=self.parse_product,
                      meta={
                          'product': product,
                          'cookiejar': self.jar_counter
                      },
                      cookies={})
def parse_product(self, response):
    """Parse a patrollersupply.com product page; yields one item plus a
    follow-up request per <select> option (options are separate cart pages).

    If no <h1> name can be found the page is retried up to 10 times
    (the site apparently serves incomplete pages sometimes).
    """
    hxs = HtmlXPathSelector(response)
    try:
        name = hxs.select(u'//h1/a/text()').extract()[-1].strip()
    except:
        try:
            name = hxs.select(u'//h1/text()').extract()[-1].strip()
        except:
            # No name at all — assume a bad/partial response and retry.
            retry = int(response.meta.get('retry', 0))
            if retry < 10:
                new_meta = response.meta.copy()
                new_meta['retry'] = retry + 1
                yield Request(response.url,
                              meta=new_meta,
                              callback=self.parse_product,
                              dont_filter=True)
            return
    # Price appears either as a "$..." font element or a "Price:" cell.
    price = hxs.select(
        u'//tr/td//font[starts-with(text(),"$")]/text()').extract()
    if price:
        price = price[0].split()[0]
    else:
        price = hxs.select(
            u'//tr/td[starts-with(text(),"Price:")]/text()').extract()
        if price:
            price = price[0].split('$')[-1]
        else:
            price = ''
    hxs = HtmlXPathSelector(response)
    category = hxs.select(u'//a[@class="linkHeading"]/text()').extract()
    if category:
        category = category[1].split(' - ')[0].strip()
    # For some products name does not change by selecting different options
    name_selected = hxs.select(
        u'//tr/td/select/option[@selected]/text()').extract()
    if name_selected:
        # Append the selected option's label; the label is expected to be
        # "<prefix>~<label>" or "<prefix> <label>" — try '~' first.
        try:
            name += name_selected[0][name_selected[0].index('~') +
                                     1:].strip()
        except:
            # http://www.patrollersupply.com/equipment/item_703.asp only price
            try:
                name += name_selected[0][name_selected[0].index(' ') +
                                         1:].strip()
            except:
                pass
    sku = hxs.select(
        u'//tr/td[contains(text(),"SKU") or contains(text(),"Part #")]/text()'
    ).re("Part #\xa0(.+)$")
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', name)
    product_loader.add_value('price', price)
    product_loader.add_value('sku', sku)
    # Identifier is the numeric tail of ".../item_<id>.asp" URLs.
    identifier = response.url.split('_')[-1].split('.asp')[0]
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', category)
    img = hxs.select(
        '//tr/td/img[contains(@src, "products")]/@src').extract()
    if img:
        img = urljoin_rfc(get_base_url(response), img[0])
        product_loader.add_value('image_url', img)
    product_loader.add_xpath(
        'brand',
        u'//tr/td[contains(text(),"Manufacturer")]/../td[last()]/a/text()')
    product_loader.add_value('shipping_cost', '')
    yield product_loader.load_item()
    # Every option variant has its own cart-review page; crawl each one
    # through this same callback.
    options = hxs.select(u'//tr/td/select/option/@value').extract()
    for opt in options:
        yield Request(
            'http://www.patrollersupply.com/store/cart_item_review.asp?ID='
            + opt,
            callback=self.parse_product)
def parse_product(self, response):
    """Parse a product page with variant options; yields one Product per
    variant (or a single Product when there are no variants).

    Each item carries a CRCMeta with an 'rrp' value derived from
    ``self.extract_rrp`` plus the per-option price delta.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Product code is the numeric tail of the ".html" URL.
    product_code = re.search('/(\d+)\.html', response.url).group(1)
    product_name = hxs.select(
        u'//div[@id="title_product"]//h1[@class="title"]/text()').extract(
        )
    image_url = hxs.select(
        '//div[@id="prod-img-slider"]//img/@src').extract()
    price = hxs.select(u'//span[@itemprop="price"]/@content').extract()
    if not price:
        # Fallback price is localized text, e.g. "1.234,56" — strip
        # everything but digits/commas, then use ',' as decimal point.
        price = hxs.select(
            u'//span[@id="mainprice"]//span[@class="price2"]/text()'
        ).extract()
        if price:
            price = re.sub('[^,\d]', '', price[0]).replace(",", ".")
    else:
        price = price[0]
    category = hxs.select(
        '//div[@id="breadcrumb"]/span/a/span/text()').extract()
    brand = hxs.select('//a[@class="logo"]/@title').extract()
    base_rrp = self.extract_rrp(hxs)
    options = hxs.select('//select[@class="variantSelect"]/option')
    # First <option> is a placeholder ("choose..."), skip it.
    for option in options[1:]:
        product_loader = ProductLoader(response=response, item=Product())
        option_id = option.select('./@value')[0].extract()
        product_loader.add_value('identifier',
                                 '{}.{}'.format(product_code, option_id))
        product_loader.add_value('sku', product_code)
        option_name = re.sub(
            ' {2,}', ' ',
            option.select(u'./text()')[0].extract().strip())
        product_loader.add_value(
            'name', u'{} {}'.format(product_name[0], option_name))
        product_loader.add_value('url', response.url)
        # The option's @title holds the price delta for this variant.
        base_option_price = float(option.select(u'./@title')[0].extract())
        option_price = '{:.2f}'.format(float(price) + base_option_price)
        product_loader.add_value('price', str(option_price))
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        if len(category) > 1:
            product_loader.add_value('category', category[1].strip())
        if brand:
            product_loader.add_value('brand', brand[0].strip().title())
        product = product_loader.load_item()
        try:
            option_rrp = '{:.2f}'.format(
                float(base_rrp) + base_option_price)
        except ValueError:
            option_rrp = 0
        metadata = CRCMeta()
        # NOTE(review): comparing a str rrp to 0 only works under Python 2
        # ordering rules (str > int is always True) — confirm if porting.
        metadata['rrp'] = option_rrp if option_rrp > 0 else ''
        product['metadata'] = metadata
        yield product
    if not options:
        # No variants — emit a single product for the base page.
        product_loader = ProductLoader(response=response, item=Product())
        product_loader.add_value('name', product_name)
        product_loader.add_value('url', response.url)
        if not price:
            product_loader.add_value('price', '0.00')
        else:
            product_loader.add_value('price', price)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        if len(category) > 1:
            product_loader.add_value('category', category[1].strip())
        if brand:
            product_loader.add_value('brand', brand[0].strip().title())
        product_loader.add_value('identifier', product_code)
        product_loader.add_value('sku', product_code)
        product = product_loader.load_item()
        metadata = CRCMeta()
        metadata['rrp'] = base_rrp if base_rrp > 0 else ''
        product['metadata'] = metadata
        yield product
def extract_links(self, response):
    """Run the underlying link extractor against this response."""
    return self._extract_links(
        response.selector,
        response.url,
        response.encoding,
        get_base_url(response),
    )
def parse(self, response):
    """Entry point: ignore the landing page and start crawling from the
    English/German (en-de) storefront.

    The original body built an unused selector and base URL from the
    response; both dead locals are removed since the response content is
    never inspected here.
    """
    yield Request("http://www.mytheresa.com/en-de",
                  callback=self.parse_site)
def parse(self, response):
    """Parse Target search results, via either the TWS JSON API (new) or
    the HTML/AJAX search pages (old), yielding product-page requests and
    pagination requests.
    """
    product_urls = []
    tcins = []
    if 'tws.target.com/searchservice/item/search_results/v2/by_keyword' in response.url:
        # Using Target Web Services
        data = json.loads(response.body)
        items = data['searchResponse']['items']['Item']
        for item in items:
            tcins.append(item['tcin'])
        try:
            # Pages
            pages_info = {}
            for d in data['searchResponse']['searchState']['Arguments'][
                    'Argument']:
                pages_info[d['name']] = d['value']
            current_page = int(pages_info['currentPage'])
            total_pages = int(pages_info['totalPages'])
            page_count = int(pages_info['resultsPerPage'])
            offset = int(pages_info.get('offset', 0))
        except:
            self.log('Next page not found => %s' % response.url)
        else:
            if current_page < total_pages:
                # Page forward by bumping the offset query parameter.
                url = add_or_replace_parameter(response.url, 'offset',
                                               str(offset + page_count))
                yield Request(url, meta=response.meta)
        for tcin in tcins:
            # Fetch each item's detail JSON from the redsky PDP API.
            url = 'http://redsky.target.com/v1/pdp/tcin/%s' % tcin
            yield Request(url, self.parse_product_json, meta=response.meta)
    else:
        # Old method ...
        hxs = HtmlXPathSelector(response)
        product_urls = hxs.select(
            '//div[@class="productTitle"]/a[@class="productClick"]/@href'
        ).extract()
        if not product_urls:
            product_urls = hxs.select(
                '//a[contains(@class, "productTitle")]/@href').extract()
        # The search term / product count live in hidden inputs, with
        # several fallbacks (visible span, then meta carried from the
        # previous request).
        search_term = hxs.select(
            '//*[@id="searchTermDbp"]/@value').extract()
        if not search_term:
            search_term = hxs.select(
                '//span[@class="srhTerm"]/text()').extract()
        if not search_term:
            search_term = response.meta.get(
                'search_term') or response.meta.get(
                    'brand', '').strip().lower().replace(' ', '+')
        else:
            search_term = search_term[0]
        product_count = hxs.select(
            '//*[@id="productCountValue"]/@value').extract()
        if not product_count:
            product_count = hxs.select('//span[@id="countMsg"]/text()').re(
                '\d+')
        if not product_count:
            product_count = response.meta.get('product_count')
        else:
            product_count = product_count[0]
        if not product_urls:
            # AJAX responses wrap the HTML fragment inside a JSON payload.
            try:
                data = json.loads(response.body)
                data = data['productListArea']['productListForm']
                hxs = HtmlXPathSelector(text=data)
                product_urls = hxs.select(
                    '//div[@class="productTitle"]/a[@class="productClick"]/@href'
                ).extract()
            except:
                pass
        next_page = hxs.select('//a[@id="seeMoreItemButton"]/@href')
        if next_page:
            self.log('Next page')
            yield Request(next_page.extract()[0],
                          meta=response.meta,
                          dont_filter=True)
        else:
            next_page = hxs.select(
                '//div[contains(@class, "next")]/a/@href').extract()
            if next_page:
                self.log('Next page')
                yield Request(urljoin_rfc(get_base_url(response),
                                          next_page[0]),
                              meta=response.meta,
                              dont_filter=True)
        if product_urls:
            # Emulate the site's "see more" AJAX POST to page in steps of
            # 16 results (Nao = next offset).
            nao = response.meta.get('nao', 16)
            hash_value = '#navigation=true&Nao=%s&viewType=medium&RatingFacet=0&customPrice=false&productsCount=%s&isDbp=true' % (
                nao, product_count)
            ajaxlinkdata = (
                'http://www.target.com/SearchNavigationView?viewType=medium&customPrice=false&productsCount=%s&RatingFacet=0'
                '&searchTerm=%s&dbpSeeMore=true&Nao=%s&isDbp=true') % (
                    product_count, search_term, nao)
            formdata = {
                'ajaxLinkData': ajaxlinkdata,
                'hashValue': hash_value,
                'stateData': '',
                'searchTerm': search_term,
                'viewType': 'medium'
            }
            yield FormRequest(
                'http://www.target.com/bp/SearchNavigationView',
                method='POST',
                formdata=formdata,
                dont_filter=True,
                meta={
                    'nao': int(nao) + 16,
                    'formdata': formdata,
                    'brand': response.meta.get('brand', ''),
                    'product_count': product_count,
                    'search_term': search_term
                })
            for url in product_urls:
                yield Request(
                    url,
                    callback=self.parse_product,
                    meta={'brand': response.meta.get('brand', '')})
        else:
            # No products found — retry up to 10 times, replaying the
            # original form POST when this page came from one.
            retry = int(response.meta.get('retry', 0))
            if retry < 10:
                retry += 1
                meta = response.meta
                meta['retry'] = retry
                if meta.get('formdata'):
                    yield FormRequest(
                        'http://www.target.com/bp/SearchNavigationView',
                        method='POST',
                        formdata=meta['formdata'],
                        dont_filter=True,
                        meta=meta)
                else:
                    yield Request(response.url, dont_filter=True, meta=meta)
            else:
                return
def parse_product(self, response):
    """Parse a musclefood.com product page (Magento).

    If the page has a ``spConfig`` options block, yield one AJAX POST per
    attribute-option combination to fetch variant prices via
    ``parse_price``; otherwise yield the single product directly.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//*[@id="image"]/@src').extract()
    try:
        product_identifier = hxs.select(
            '//input[@name="product"]/@value').extract()[0].strip()
    except:
        # Fallback: pull the numeric id out of the add-to-cart form action.
        product_identifier = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')[0]
    product_name = hxs.select('//*[@id="productname"]/text()').extract()[0]
    # Drop the leading "Home" breadcrumb.
    category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()[1:]
    sku = product_identifier
    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.groups()[0])
        # products: variant id -> concatenated option labels
        # attributes: variant id -> list of {attr_id, val} pairs
        products = {}
        attributes = {}
        for attr_id, attr in product_data['attributes'].iteritems():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' '.join(
                        (products.get(product, ''), option['label']))
                    attributes.setdefault(product, []).append({
                        'attr_id': attr_id,
                        'val': option['id']
                    })
        for identifier, option_name in products.iteritems():
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)
            product_loader.add_value('name', product_name + option_name)
            if image_url:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            product_loader.add_value('sku', sku)
            product_loader.add_value('brand', response.meta.get('brand'))
            product = product_loader.load_item()
            # Price is not on the page for variants; ask the servings-info
            # AJAX endpoint with the variant's super_attribute values.
            form_data = {'product': product_identifier, 'billing_qty': '1'}
            for attr in attributes[identifier]:
                form_data['super_attribute[{}]'.format(
                    attr['attr_id'])] = str(attr['val'])
            yield FormRequest(
                url='http://www.musclefood.com/billing/ajax/servingsinfo/',
                formdata=form_data,
                meta={'product': product},
                callback=self.parse_price,
                dont_filter=True)
    else:
        # Simple product without options.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        price = ''.join(
            hxs.select('//span[@class="price"]/text()').extract()).strip()
        price = extract_price(price)
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        product_loader.add_value('sku', sku)
        product_loader.add_value('brand', response.meta.get('brand'))
        # Orders under 75 incur a flat delivery charge.
        if price < 75:
            product_loader.add_value('shipping_cost', 3.95)
        product = product_loader.load_item()
        yield product
def parse_product(self, response):
    """Parse a Magento product page: grouped sub-products (one item per
    row of the super-product table), spConfig option variants, or a plain
    single product.
    """
    if not isinstance(response, HtmlResponse):
        return
    base_url = get_base_url(response)
    # sub products
    hxs = HtmlXPathSelector(response)
    # compound product
    identifier = hxs.select(
        '//input[@type="hidden" and @name="product"]/@value')[0].extract()
    image_url = hxs.select(
        '//div[@class="onsale-product-container"]/a/img/@src').extract()
    if not image_url:
        image_url = hxs.select(
            '//p[@class="product-image"]/a[@id="zoom1"]/@href').extract()
    category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()
    loader = WindowsCleaningProductLoader(item=Product(), selector=hxs)
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    if category:
        loader.add_value('category', category[-1])
    # Grouped product: each table row (skipping the header) is a
    # sub-product that gets its own item, identified as "<id>.<index>".
    sub_products = hxs.select('//table[@id="super-product-table"]//tr')[1:]
    if sub_products:
        item = loader.load_item()
        # Sort rows by name so ".<index>" identifiers stay stable.
        sub_products.sort(
            key=lambda p: p.select('td[1]//text()')[0].extract())
        i = 0
        for p in sub_products:
            name = p.select('td[1]//text()')[0].extract()
            price = ''.join(p.select('td[2]//text()').extract()).strip()
            in_stock = p.select('td[3]/input')
            loader = WindowsCleaningProductLoader(item=item, selector=hxs)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', '')
            loader.add_value('identifier', '%s.%s' % (identifier, i))
            if not in_stock:
                # No quantity input in the row means not purchasable.
                loader.add_value('stock', 0)
            yield loader.load_item()
            i += 1
        return
    name = hxs.select(
        '//div[@class="product-name"]/h1/text()')[0].extract()
    loader.add_value('url', response.url)
    loader.add_value('sku', '')
    loader.add_value('identifier', identifier)
    loader.add_value('name', name)
    out_of_stock = hxs.select(
        '//p[contains(@class, "availability") and contains(@class, "out-of-stock")]'
    )
    if out_of_stock:
        loader.add_value('stock', 0)
    price = hxs.select(
        '//div[@class="product-shop"]//p[@class="special-price"]/span[2]/text()'
    ).extract()
    if not price:
        price = hxs.select(
            '//div[@class="product-shop"]//span[@class="regular-price"]/span/text()'
        ).extract()
    price = price if price else '0.00'
    loader.add_value('price', price)
    # TODO stock
    options = re.search('var spConfig = new Product\.Config\(({.*})\);',
                        response.body)
    if options:
        # Configurable product: emit one item per attribute option with
        # price = basePrice + option delta.
        item = loader.load_item()
        options = json.loads(options.group(1))
        base_price = float(options['basePrice'])
        for attribute in options['attributes'].values():
            for option in attribute['options']:
                opt_item = Product(item)
                opt_item['identifier'] += '.%s.%s' % (attribute['id'],
                                                      option['id'])
                opt_item['name'] += ' %s' % option['label']
                opt_item['price'] = Decimal(
                    str(float(base_price) + float(option['price'])))
                yield opt_item
    else:
        yield loader.load_item()
def parse_item(self, response):
    """Extract a SohuitItem (article metadata + body) from a Sohu page.

    Each field is the first matching node's text, or None when the page
    has no such node.
    """
    sel = Selector(response)

    def first_or_none(values):
        # Same semantics as the repeated "len(...) != 0" checks: take the
        # first extracted value, otherwise None.
        return values[0] if len(values) != 0 else None

    item = SohuitItem()
    item['url'] = get_base_url(response)
    item['title'] = first_or_none(sel.css('h1').xpath('text()').extract())
    item['source'] = first_or_none(
        sel.css('.source .sc #media_span span').xpath('text()').extract())
    item['createtime'] = first_or_none(
        sel.css('.time').xpath('text()').extract())
    item['author'] = first_or_none(
        sel.css('.source #author_baidu').xpath('text()').extract())
    item['abstract'] = None

    body_parts = sel.xpath('//div[@id="contentText"]//p/text()').extract()
    item['content'] = "".join(body_parts) if len(body_parts) != 0 else None

    nav_links = sel.xpath('//div[@id="channel-nav"]/div/span/a/text()')
    if len(nav_links) != 0:
        item['category'] = "/".join(nav_links.extract())
    else:
        item['category'] = None

    # Crawl timestamp, local time.
    item['gettime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime(time.time()))

    comment_nodes = sel.xpath(
        '//span[@class="f12"]/span[@class="red"]/text()')
    item['commentnum'] = (comment_nodes[0].extract()
                          if len(comment_nodes) != 0 else None)

    return [item]
def parse_item(self, response):
    """Parse a fund-notice listing page into GGFundNoticeItem objects.

    Two modes, selected by ``response.meta['ext']``: when 'one' is set,
    the page is a single notice whose date is read from the blog-post
    header; otherwise it is the tab6 listing, which also enqueues one
    "one"-mode follow-up via ``self.ips``.
    """
    ext = response.meta['ext']
    if 'one' in ext:
        url = ext['url']
        title = ext['title']
        publish_time = response.xpath(
            '//div[@class="blog-post"]/ul/li[1]/text()').extract_first(
            ).strip().replace('\t', '').replace('\r', '').replace('\n', '')
        # Date format: characters [5:15] are the "YYYY-MM-DD" part.
        publish_time = publish_time[5:15]
        publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    else:
        urlOne = response.xpath(
            '//div[@id="tab6"]/dl/dd/h5/a/@href').extract_first()
        urlOne = urljoin(get_base_url(response), urlOne)
        titleOne = response.xpath('//div[@id="tab6"]/dl/dd/h5/a/text()'
                                  ).extract_first().strip().replace(
                                      '\t', '').replace('\r',
                                                        '').replace('\n', '')
        # NOTE(review): publish_timeOne is always '' here, so this branch
        # is unconditionally taken — the condition looks vestigial.
        publish_timeOne = ''
        if publish_timeOne == '' or publish_timeOne is None:
            self.ips.append({
                'url':
                'http://www.rtcapital.cn/product/cpgg/qsbg/857.html',
                'ext': {
                    'one': '1',
                    'url': urlOne,
                    'title': titleOne
                }
            })
        rows = response.xpath('//div[@id="tab6"]/div/ul/li')
        for row in rows:
            url = row.xpath('./a/@href').extract_first()
            url = urljoin(get_base_url(response), url)
            title = row.xpath(
                './a/text()').extract_first().strip().replace(
                    '\t', '').replace('\r', '').replace('\n', '')
            publish_time = row.xpath(
                './span/text()').extract_first().strip().replace(
                    '\t', '').replace('\r', '').replace('\n', '')
            # Keep only the leading "YYYY-MM-DD".
            publish_time = publish_time[0:10]
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time
            yield item
def parse(self, response):
    """Parse a tyre search-results table: build a Product + MicheldeverMeta
    per non-winter row, validating the tyre-size pattern and skipping
    anything that fails ``is_product_correct``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select('//tr[contains(@class,"tyre-search-row")]')
    # NOTE(review): next_page is always empty, so this pagination branch
    # is dead code — presumably a leftover or disabled feature.
    next_page = []
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]),
                      meta=response.meta)
    not_found_count = 0
    for product in products:
        url = product.select('.//td/b/a/@href')[0].extract()
        winter_tyre = product.select('.//td/b/a/text()')[0].extract()
        winter_tyre = 'winter' in winter_tyre.lower()
        if not winter_tyre:
            # Brand comes from the brand-logo image filename.
            brand = product.select('.//a/img/@src')[0].extract()
            brand = re.search('/public/brands/(.*?)(-tyres)?\.',
                              brand).group(1).replace('-', ' ').title()
            product_name = product.select('.//td/b/a/text()')[0].extract()
            product_name = re.sub(brand, '', product_name).strip()
            fitting_method = 'Delivered'
            identifier = product.select(
                './/input[@name="item_id"]/@value').extract()
            if not identifier:
                # Out-of-stock rows expose the id via the notify-me link.
                identifier = product.select('.//a/@href').re(
                    'email_me_stock/(.*)')
            if not identifier:
                continue
            # EU tyre-label values follow the label image in the markup.
            try:
                fuel, grip, noise = map(
                    unicode.strip,
                    product.select(
                        './/img[contains(@alt, "Tyre Label")]/following-sibling::text()'
                    ).extract())
            except:
                fuel = ''
                grip = ''
                noise = ''
            price = product.select("td[3]/b/text()").extract()
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', identifier[0])
            loader.add_value('name', product_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('url', url)
            if price:
                loader.add_value('price', price[0])
            else:
                loader.add_value('price', '0.00')
                loader.add_value('stock', 0)
            pattern_name = product.select('.//i/text()').extract()
            if not pattern_name:
                continue
            pattern_name = pattern_name[0]
            # e.g. "205/55 R16 V (91 ...)" -> width/aspect/rim/speed/load
            data = re.search(
                '(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
                pattern_name)
            if data:
                data = data.groupdict()
            else:
                msg = 'ERROR parsing "{}" [{}]'.format(
                    pattern_name, response.url)
                self.log(msg)
                continue
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating'].upper()
            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            xl = 'XL' in pattern_name
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat_found = is_run_flat(pattern_name)
            run_flat = 'run flat' in pattern_name.lower(
            ) or 'runflat' in pattern_name.lower() or run_flat_found
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            # First manufacturer mark whose token appears in the pattern.
            manufacturer_mark = [
                mark for mark in self.all_man_marks.keys()
                if mark in pattern_name.split(' ')
            ]
            manufacturer_mark = manufacturer_mark[0].strip(
            ) if manufacturer_mark else []
            metadata['manufacturer_mark'] = find_man_mark(
                manufacturer_mark) if manufacturer_mark else ''
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))
            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise
            product = loader.load_item()
            product['metadata'] = metadata
            if not is_product_correct(product):
                not_found_count += 1
                self.log('%s - PRODUCT IS NOT CORRECT: %r' %
                         (not_found_count, product))
                continue
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            # Use a cached image URL when we have one; otherwise fetch the
            # product page just for the image.
            if product['url'] in self.images:
                product['image_url'] = self.images[product['url']]
                yield product
            else:
                yield Request(product['url'],
                              callback=self.parse_image,
                              meta={'product': product},
                              dont_filter=True)
def parse_product_list(self, response):
    """Parse an Amazon search-results page: match results against the
    search item, follow more-buying-choices and product pages, fall back
    to suggested searches when nothing is found, and paginate carefully
    (to avoid crawling all of Amazon from a suggested search).
    """
    hxs = HtmlXPathSelector(response)
    found_something = False
    matched_any = False
    suggested_product_list = response.meta.get('suggested_search_peek',
                                               False)
    meta = response.meta
    for result in hxs.select(
            u'//div[@id="atfResults" or @id="btfResults"]//div[starts-with(@id, "result_")]'
    ):
        found_something = True
        more_buying_choices = result.select(
            './/li[@class="sect mbc"]/../li[contains(@class,"mkp2")]/a/@href'
        ).extract()
        if more_buying_choices:
            # Offers from other sellers live on a separate MBC page.
            url = urljoin_rfc(get_base_url(response),
                              more_buying_choices[0])
            append_request(url, self.parse_mbc_list, response.meta)
            continue
        try:
            product_name = result.select(
                u'.//h3/a/span/text()').extract()[0]
        except:
            continue
        try:
            identifier = result.select('./@name').extract()[0]
        except:
            # Placeholder result divs end in "_empty"; anything else
            # missing a @name is unexpected, so re-raise.
            if not result.select('./@id').extract()[0].endswith('_empty'):
                raise
            continue
        price = result.select(
            './/span[@class="bld lrg red"]//text()').extract()
        if not price:
            price = result.select(
                './/span[contains(@class, "price")]//text()').extract()
        if not price:
            self.log('No price on %s' % (response.url))
            continue
        price = self.extract_price(price[0])
        product = Product(response.meta['search_item'])
        product['name'] = product_name
        brand = hxs.select(
            u'.//h3/span[contains(text(),"by")]/text()').extract()
        if brand:
            product['brand'] = brand[0].replace('by ', '').replace(
                'de ', '').replace('(', '').strip()
        product['price'] = price
        if self._use_amazon_identifier:
            product['identifier'] = product.get('identifier',
                                                '') + ':' + identifier
        url = result.select(u'.//h3/a/@href').extract()[0]
        product['url'] = urljoin_rfc(get_base_url(response), url)
        image_url = result.select(
            u'.//img[@class="productImage"]/@src').extract()
        if image_url:
            product['image_url'] = urljoin_rfc(get_base_url(response),
                                               image_url[0])
        if self.match(response.meta['search_item'], product):
            matched_any = True
            # Go and extract vendor
            meta = dict(response.meta)
            meta['_product'] = product
            append_request(product['url'], self.parse_product, meta)
    # Follow suggested links only on original search page
    if not suggested_product_list and not found_something:
        urls = hxs.select(
            u'//div[contains(@class,"fkmrResults")]//h3[@class="fkmrHead"]//a/@href'
        ).extract()
        if urls:
            self.log(
                'No results found for [%s], trying suggested searches' %
                (response.meta['search_string']))
        else:
            self.log('No results found for [%s], no suggested searches' %
                     (response.meta['search_string']))
            # Build a fallback search: brand+sku if a sku exists,
            # otherwise the plain product name.
            row = Product(response.meta['search_item'])
            search_term = ''
            if row['sku']:
                search_term = row['brand'] + ' ' + row['sku']
                meta['search_item']['sku_search'] = True
                self.log(
                    'No results found for [%s], trying searching by Brand + Model'
                    % (search_term))
            else:
                search_term = row['name'].replace(' ', '+')
                self.log(
                    'No results found for [%s], trying searching by name'
                    % (search_term))
            search_url = 'http://www.amazon.it/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=' + search_term
            urls.append(search_url)
        for url in urls:
            url = urljoin_rfc(get_base_url(response), url)
            append_request_suggested(url, self.parse_product_list, meta)
    next_url = hxs.select(u'//a[@id="pagnNextLink"]/@href').extract()
    # Follow to next pages only for original search
    # and suggested search if at least one product matched from first page
    # otherwise it tries to crawl the whole Amazon or something like that
    if next_url and (not suggested_product_list or matched_any):
        page = response.meta.get('current_page', 1)
        if self.max_pages is None or page <= self.max_pages:
            response.meta['suggested_search_peek'] = False
            response.meta['current_page'] = page + 1
            url = urljoin_rfc(get_base_url(response), next_url[0])
            append_request(url, self.parse_product_list, response.meta)
        else:
            self.log('Max page limit %d reached' % (self.max_pages))
    for x in self._continue_requests(response):
        yield x
def parse(self, response):
    """Follow every top-level category link into ``parse_categories``."""
    selector = HtmlXPathSelector(response)
    base = get_base_url(response)
    for href in selector.select(
            '//ul[@class="b_main"]/li/a/@href').extract():
        yield Request(urljoin_rfc(base, href),
                      callback=self.parse_categories)
def parse_product(self, response):
    """Parse a tweekscycles.com product page.

    Three shapes are handled: attribute-dropdown products (one option
    request per combination via ``parse_opt``), buy-button option tables
    (one item per row), and plain products (single item with RRP meta).
    Falls through to ``parse_cat`` when the page is actually a category.
    """
    # This looses part of HTML on http://www.tweekscycles.com/clearance/clearance-bikes/scott-scale-935-29er-hardtail-mountain-bike-2014
    # No idea why and how but hxs.select('//select') finds only one elem
    # while the same with text=response.body finds them all
    hxs = HtmlXPathSelector(text=response.body.decode('ISO-8859-1'))
    if not hxs.select('//td[@id="product-title"]'):
        # nope, category
        for x in self.parse_cat(response):
            yield x
        return
    category = ''.join(
        hxs.select(
            'normalize-space(//div[@id="breadcrumb"]/a[position()=last()]/text())'
        ).extract())
    brand = ''.join(
        hxs.select('//td[@id="brand-location"]/img/@alt').extract())
    img = hxs.select('//img[@id="mainImage"]/@src').extract()
    img = urljoin_rfc(get_base_url(response), img[0]) if img else ''
    url = response.url
    shipping_cost = '0'
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('shipping_cost', shipping_cost)
    loader.add_value('brand', brand)
    loader.add_value('url', url)
    loader.add_value('category', category)
    loader.add_value('image_url', img)
    opts = []
    req = hxs.select(
        '//div[@id="attribOption"]//select[@onchange]/@onchange').extract(
        )
    self.log(repr(req))
    if req:
        # Recover the (n, g) arguments of the site's updateAttrib() JS
        # call, then request every option combination through the
        # prodAttrib endpoint.
        m = re.search("updateAttrib\('(.*)', '(.*)', '(.*)'\)", req[0])
        n, g = m.group(1), m.group(2)
        i = hxs.select('//input[@name="iid"]/@value').extract().pop()
        for sel in hxs.select(
                '//div[@id="attribOption"]//select[@onchange]'):
            group = []
            for opt in sel.select(
                    './option[position()!=1]/@value').extract():
                group.append(opt)
            opts.append(group)
        for opt in multiply(opts):
            url = 'http://www.tweekscycles.com/Product.do?method=prodAttrib&n=%s&g=%s&a=%s&i=%s&q=1&uid=%s'
            try:
                yield Request(url % (n, g, urllib.quote_plus(opt + '@'), i,
                                     datetime.now().strftime("%s000")),
                              meta={'item': loader.load_item()},
                              callback=self.parse_opt)
            except KeyError:
                pass
    else:
        options = hxs.select(
            "//div[@id='buyButton']/parent::td/parent::tr")
        if options:
            # One buy-button row per option: "<title> - <option>".
            for option in options:
                name = hxs.select("//td[@id='product-title']/text()"
                                  ).extract()[0].strip()
                name = name + ' - ' + option.select(
                    "./td[1]/text()").extract()[0].strip()
                price = option.select(
                    "./td[3]/span/text()").extract()[0].strip()
                price = extract_price(price)
                stock = 1 if price > 0 else 0
                # Identifier is embedded in the buy button's JS call.
                identifier = option.select(
                    ".//div[@id='buyButton']").extract()[0]
                identifier = re.findall(
                    re.compile("addExpandBasket\(.+?\'(\d*)\'\)"),
                    identifier)
                loader = ProductLoader(item=Product(), selector=hxs)
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('brand', brand)
                loader.add_value('url', url)
                loader.add_value('category', category)
                loader.add_value('image_url', img)
                loader.add_value('name', name)
                loader.add_value('identifier', identifier)
                loader.add_value('sku', identifier)
                loader.add_value('price', str(price))
                loader.add_value('stock', stock)
                yield loader.load_item()
        else:
            # Plain single product.
            loader.add_xpath(
                'name', 'normalize-space(//td[@id="product-title"]/text())')
            loader.add_xpath(
                'sku',
                'normalize-space(//span[@id="prodTitle"]/span[position()=last()]/text())'
            )
            loader.add_xpath(
                'identifier',
                'normalize-space(//span[@id="prodTitle"]/span[position()=last()]/text())'
            )
            if not loader.get_output_value('identifier'):
                # Fall back to the id inside the review() JS call.
                loader.add_value(
                    'identifier',
                    re.search("review\('[^']*', '0', '([^']*)'\)",
                              response.body).group(1))
            loader.add_xpath(
                'price',
                '//span[@id="prodPriceLower" or @id="prodPrice"]/span/text()'
            )
            if loader.get_output_value('price') > 0:
                loader.add_value('stock', '1')
            # "Was <price>" text, if present, becomes the RRP metadata.
            rrp = hxs.select('//span[@id="prodPrice"]/span/text()').re(
                r'Was (.*)')
            rrp = str(extract_price(rrp[-1])) if rrp else ''
            prod = loader.load_item()
            metadata = CRCMeta()
            metadata['rrp'] = rrp
            prod['metadata'] = metadata
            yield prod
def urljoin_to_context(url, loader_context):
    """Resolve *url* against the base URL of the response carried in the
    item-loader context."""
    ctx_response = loader_context.get('response')
    base = get_base_url(ctx_response)
    return get_absolute_url(url, base)
def parse(self, response):
    """Parse a product-listing page.

    Yields one Product item per listing row, then follows pagination,
    sub-category and "Tutte le offerte" (all offers) links.  When the
    category title cannot be extracted, the page is retried up to 10
    times (it usually indicates a partially rendered response) before
    falling back to a blank category.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select('//table[@id="productlist-table"]/tbody/tr')

    # Category comes from the page header, or from request meta on
    # paginated pages.
    category = hxs.select('//div[@id="divTitle"]/h1/text()').extract()
    if category:
        category = category[0]
    else:
        category = response.meta.get('category')
    if not category:
        self.log("Couldn't extract category from: %s" % response.url)
        tries = response.meta.get('try', 1)
        if tries < 10:
            self.log("Retrying page: %s" % response.url)
            yield Request(response.url, dont_filter=True, meta={'try': tries + 1})
        else:
            # Fixed typo in the log message: "blanl" -> "blank".
            self.log("Gave up retrying: %s. Using blank category" % response.url)
            self.errors.append("Blank category on page: %s" % response.url)
            category = ''

    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        image_url = product.select('td[@class="imgCol"]/a/img/@src').extract()
        image_url = urljoin(base_url, image_url[0]) if image_url else ''
        loader.add_value('image_url', image_url)
        loader.add_xpath('dealer', 'td[@class="mercCol"]/a/img/@alt')
        loader.add_xpath('name', 'td[@class="descCol"]/a/b/text()')
        loader.add_value('category', category)
        loader.add_value('sku', '')
        url = product.select('td[@class="descCol"]/a/@href').extract()[0]
        url = url.partition("?")[0]
        # The path segment after 'goto/' is the site's product id.
        identifier = url.partition('goto/')[-1]
        loader.add_value('identifier', identifier)
        loader.add_value('url', urljoin(base_url, url))
        price = "".join(product.select('td[@class="prodListPrezzo"]/text()').extract())
        if not price:
            continue
        # Italian number format: "1.234,56" -> "1234.56".
        loader.add_value('price', price.strip().replace('.', '').replace(',', '.'))
        # Bug fix: the original indexed [0] on the selector list before
        # calling .re(), which raised IndexError whenever a row had no
        # delivery-cost span.  Calling .re() on the (possibly empty)
        # selector list safely returns [] instead.
        shipping_cost = product.select(
            'td[@class="prodListPrezzo"]/span[contains(@class, "deliveryCost")]/text()'
        ).re(r'([\d,]+)')
        if shipping_cost:
            loader.add_value('shipping_cost', shipping_cost[0].replace(',', '.'))
        item = loader.load_item()
        # Keep the first name seen for each identifier so duplicate
        # listings of the same product agree on the name.
        if item['identifier'] in self.product_ids:
            item['name'] = self.product_ids[item['identifier']]
        else:
            self.product_ids[item['identifier']] = item['name']
        yield item

    pagination = hxs.select('//div[@class="pagination"]/a/@href').extract()
    for page in pagination:
        yield Request(urljoin(base_url, page), meta={'category': category})

    sub_categories = hxs.select('//tr[@class="subCatsMother"]/td/b/a/@href').extract()
    for sub_cat in sub_categories:
        yield Request(urljoin(base_url, sub_cat))

    all_products = hxs.select(
        '//td[@id="col_sxcent"]/div/a[contains(text(), "Tutte le offerte")]/@href'
    ).extract()
    if all_products:
        yield Request(urljoin(base_url, all_products[0]))
def urljoin(self, url):
    """Join this Response's url with a possible relative url to form an
    absolute interpretation of the latter.

    Note: the ``urljoin`` called in the body resolves to the module-level
    function (class scope does not shadow names inside a method body).
    """
    base = get_base_url(self)
    return urljoin(base, url)