def extract_values_from_list(self, item, response):
    """Collect the listing-page fields for one product link.

    ``item`` is a selector whose own ``@href`` is the product link and whose
    child ``img`` carries the thumbnail in ``@src``. Both values are passed
    through ``get_absolute_url`` together with ``response``.

    Returns a dict with the keys ``url``, ``list_images`` and
    ``product_number``; the product number is whatever follows the last
    ``/`` of the resolved URL. Indexing ``[0]`` fails if either attribute is
    absent.
    """
    href = item.xpath('@href').extract()[0]
    page_url = get_absolute_url(response, href)

    thumbnail_src = item.xpath('img/@src').extract()[0]
    thumbnail_url = get_absolute_url(response, thumbnail_src)

    # The negative lookahead makes the search settle on the first position
    # that has no '/' after it, so group 1 is the trailing URL segment.
    trailing_segment = re.search(r'(?!.*\/)(.*?)$', page_url)

    return {
        'url': page_url,
        'list_images': thumbnail_url,
        'product_number': trailing_segment.group(1),
    }
def parse_sub_sub(self, response):
    """Process one level-2 category page, then chain to the next one.

    Crawl state is read from ``response.meta``: ``lev2_sel_list`` (the
    level-2 link selectors), ``idx`` (which of them this response belongs
    to), ``parent_loader_list`` (one loader per level-2 link) and
    ``top_category_loader``.

    Every ``//div[@id="refineBycategory"]/a`` link on the page is loaded via
    ``self.generate_loader`` and added to the current level-2 loader under
    ``sub_categories``; that loader's item is then added to the top-level
    loader under the same key.

    Yields the finished top-level item once the last level-2 page has been
    handled, otherwise a ``Request`` for the next level-2 page that carries
    the same state with ``idx`` advanced by one.
    """
    meta = response.meta
    lev2_sel_list = meta['lev2_sel_list']
    top_category_loader = meta['top_category_loader']
    parent_loader_list = meta['parent_loader_list']
    idx = meta['idx']

    current_parent = parent_loader_list[idx]
    for child_sel in response.xpath('//div[@id="refineBycategory"]/a'):
        child_item = self.generate_loader(child_sel, response).load_item()
        current_parent.add_value('sub_categories', child_item)
    top_category_loader.add_value('sub_categories',
                                  current_parent.load_item())

    next_idx = idx + 1
    if next_idx == len(lev2_sel_list):
        # Every level-2 page has been visited: emit the assembled tree.
        yield top_category_loader.load_item()
        return

    # Otherwise continue with the next level-2 node.
    next_href = lev2_sel_list[next_idx].xpath('@href').extract()[0]
    yield Request(
        get_absolute_url(response, next_href),
        callback=self.parse_sub_sub,
        meta={
            'lev2_sel_list': lev2_sel_list,
            'idx': next_idx,
            'top_category_loader': top_category_loader,
            'parent_loader_list': parent_loader_list,
        },
    )
def parse(self, response):
    """Schedule one request per top-level category link.

    For each anchor matched under ``div.category > div.category_static``,
    a loader is built with ``self.generate_loader`` and attached to the
    outgoing request's meta as ``top_level_category_loader``. The request
    targets the anchor's ``@href`` (passed through ``get_absolute_url``)
    and is handled by ``self.parse_sub``.
    """
    category_links = response.xpath(
        '//div[@class="category"]/div[@class="category_static"]/ul/li/a')
    for link_sel in category_links:
        loader = self.generate_loader(link_sel, response)
        href = link_sel.xpath('@href').extract()[0]
        yield Request(
            get_absolute_url(response, href),
            callback=self.parse_sub,
            meta={'top_level_category_loader': loader},
        )
def parse(self, response):
    """Schedule one request per top-level navigation entry.

    Iterates the ``li`` elements without an ``id`` attribute inside
    ``div#mainNavGOL > ul.gpnavigation``. Each ``li`` is handed to
    ``self.generate_loader``; the resulting loader travels in the request
    meta as ``top_level_category_loader``. The request URL comes from the
    entry's ``a/@href`` (passed through ``get_absolute_url``) and the
    callback is ``self.parse_sub``.
    """
    nav_entries = response.xpath(
        '//div[@id="mainNavGOL"]/ul[@class="gpnavigation"]/li[not(@id)]')
    for entry_sel in nav_entries:
        loader = self.generate_loader(entry_sel, response)
        href = entry_sel.xpath('a/@href').extract()[0]
        yield Request(
            get_absolute_url(response, href),
            callback=self.parse_sub,
            meta={'top_level_category_loader': loader},
        )
def extract_values_from_list(self, item, response):
    """Collect the listing-page fields for one product entry.

    The product link is read from ``a/@href`` and passed through
    ``get_absolute_url``; the image value is taken verbatim from the
    ``productimagepath`` attribute of the ``img`` inside that anchor.

    Returns a dict with the keys ``url`` and ``list_images``. Indexing
    ``[0]`` fails if either attribute is absent.
    """
    href = item.xpath('a/@href').extract()[0]
    values = {'url': get_absolute_url(response, href)}
    # Unlike the link, the image path is stored as found on the page.
    values['list_images'] = item.xpath(
        'a/img/@productimagepath').extract()[0]
    return values
def extract_values_from_list(self, item, response):
    """Collect the listing-page fields for one product tile.

    Both values live under the tile's ``div.prodImage > a`` element: the
    anchor's ``@href`` (passed through ``get_absolute_url``) and the ``@src``
    of the image inside it, which is kept exactly as extracted.

    Returns a dict with the keys ``url`` and ``list_images``. Indexing
    ``[0]`` fails if either attribute is absent.
    """
    link_matches = item.xpath('div[@class="prodImage"]/a/@href').extract()
    product_url = get_absolute_url(response, link_matches[0])

    image_matches = item.xpath(
        'div[@class="prodImage"]/a/img/@src').extract()
    thumbnail = image_matches[0]

    return {'url': product_url, 'list_images': thumbnail}
def parse_sub(self, response):
    """Start the sequential walk over a top category's level-2 pages.

    Reads ``top_category_loader`` from ``response.meta`` and collects every
    ``//div[@id="refineBycategory"]/a`` link on the page. One loader per
    link is built with ``self.generate_loader``.

    Yields a ``Request`` for the first level-2 link with
    ``self.parse_sub_sub`` as callback. Its meta carries the link selectors
    (``lev2_sel_list``), the matching loaders (``parent_loader_list``), the
    current position (``idx``, starting at 0) and the top-level loader.

    If the page has no level-2 links, the top-level item is yielded directly.
    Previously that case raised ``IndexError`` on the empty selector list,
    so the category produced no item at all.
    """
    top_category_loader = response.meta['top_category_loader']

    lev2_sel_list = list(response.xpath('//div[@id="refineBycategory"]/a'))
    if not lev2_sel_list:
        # Nothing to descend into: emit the category without children.
        yield top_category_loader.load_item()
        return

    parent_loader_list = [
        self.generate_loader(sel, response) for sel in lev2_sel_list
    ]

    # Go to the first level-2 node; parse_sub_sub chains to the rest.
    idx = 0
    first_href = lev2_sel_list[idx].xpath('@href').extract()[0]
    yield Request(
        get_absolute_url(response, first_href),
        callback=self.parse_sub_sub,
        meta={
            'lev2_sel_list': lev2_sel_list,
            'idx': idx,
            'top_category_loader': top_category_loader,
            'parent_loader_list': parent_loader_list,
        },
    )
def parse(self, response):
    """Schedule one request per store-menu link, skipping excluded sections.

    Each anchor under ``div.webstoremenu > ul > li`` is resolved with
    ``get_absolute_url``. Links whose URL contains any of the excluded
    section slugs are ignored. For the rest, a loader is built with
    ``self.generate_loader`` and sent along in the request meta as
    ``top_category_loader``; the callback is ``self.parse_sub``.
    """
    excluded_sections = (
        'fsa-store',
        'gnc-store',
        'green-and-natural',
        'the-sale',
    )
    for menu_link in response.xpath('//div[@class="webstoremenu"]/ul/li/a'):
        href = menu_link.xpath('@href').extract()[0]
        url = get_absolute_url(response, href)
        if any(section in url for section in excluded_sections):
            continue
        loader = self.generate_loader(menu_link, response)
        yield Request(
            url,
            callback=self.parse_sub,
            meta={'top_category_loader': loader},
        )
def extract_values_from_list(self, item, response):
    """Collect the listing-page fields for one product tile.

    The product link is the ``href`` of the tile's ``a.thumb-link``; it is
    passed through ``get_absolute_url`` for ``url``, and its last path
    segment, minus the ``.html`` suffix, becomes ``product_number``. The
    thumbnail ``src`` is returned exactly as extracted.

    Returns a dict with the keys ``url``, ``list_images`` and
    ``product_number``. Indexing ``[0]`` fails if the link, the image or the
    ``.html`` suffix is absent.
    """
    # Select the link once; it feeds both the URL and the product number.
    thumb_href = item.css(
        'div.product-tile > div.product-image > a.thumb-link::attr(href)'
    )
    url = get_absolute_url(response, thumb_href.extract()[0])

    list_images = item.css(
        'div.product-tile > div.product-image > a.thumb-link > img::attr(src)'
    ).extract()[0]

    # The dot before "html" is escaped so it only matches a literal '.';
    # unescaped, a slug such as "abchtml.html" was cut short to "ab".
    product_number = thumb_href.re(r'(?!.*\/)(.*?)\.html')[0]

    return {
        'url': url,
        'list_images': list_images,
        'product_number': product_number,
    }
def parse_list(self, response):
    """Turn a JSON product listing into rendered product-page requests.

    The response body is parsed as JSON and its ``items`` list is walked.
    Entries rejected by ``self.check_scrapable`` are skipped. For the rest,
    the entry's ``link`` is passed through ``get_absolute_url`` and a
    ``Request`` with ``self.parse_item`` as callback is yielded.

    Each request's meta holds two keys: ``splash``, asking for the
    ``execute`` endpoint with the Lua source produced by
    ``make_lua_script``, and ``values_from_list``, the result of
    ``self.extract_values_from_list`` for the entry.
    """
    listing = json.loads(response.body)
    for entry in listing['items']:
        if not self.check_scrapable(entry):
            continue

        product_url = get_absolute_url(response, entry['link'])
        listing_values = self.extract_values_from_list(entry, response)

        # NOTE(review): presumably these are the elements the Lua script
        # waits for before returning the page - confirm in make_lua_script.
        wait_selectors = [".BVRRWidget", "iframe#contentFrame"]
        lua_source = make_lua_script(wait_selectors, '&&')

        yield Request(
            product_url,
            callback=self.parse_item,
            meta={
                'splash': {
                    'endpoint': 'execute',
                    'args': {'lua_source': lua_source},
                },
                'values_from_list': listing_values,
            },
        )