Example #1
0
    def extract_values_from_list(self, item, response):
        """Collect the listing-page fields for one product anchor.

        Args:
            item: Selector for the product anchor; its ``@href`` and the
                ``@src`` of its child ``img`` are read.
            response: Response the selector belongs to; handed to
                ``get_absolute_url`` together with each extracted link.

        Returns:
            dict with the keys ``url``, ``list_images`` and
            ``product_number``.

        Raises:
            IndexError: if the anchor has no ``href`` or no child image
                ``src``.
        """
        values = {}
        values['url'] = get_absolute_url(
            response, item.xpath('@href').extract()[0])
        values['list_images'] = get_absolute_url(
            response, item.xpath('img/@src').extract()[0])

        # The lookahead rejects every position that still has a '/' ahead of
        # it, so the group captures what follows the last slash of the URL.
        tail = re.search(r'(?!.*\/)(.*?)$', values['url'])
        values['product_number'] = tail.group(1)
        return values
Example #2
0
    def parse_sub_sub(self, response):
        """Attach third-level categories to the current second-level node.

        Reads four entries from ``response.meta``: ``lev2_sel_list`` (the
        second-level link selectors), ``parent_loader_list`` (one loader per
        second-level link), ``idx`` (the position being processed) and
        ``top_category_loader``.

        Yields:
            The loaded top-level item once the last second-level node has
            been processed; otherwise a Request for the next second-level
            link, handled by this same callback with ``idx`` advanced by one.
        """
        state = response.meta
        level2_links = state['lev2_sel_list']
        top_loader = state['top_category_loader']
        level2_loaders = state['parent_loader_list']
        position = state['idx']

        # Every link on this page becomes a child of the current level-2 node.
        for node in response.xpath('//div[@id="refineBycategory"]/a'):
            child_loader = self.generate_loader(node, response)
            level2_loaders[position].add_value(
                'sub_categories', child_loader.load_item())

        # The level-2 node is complete now, so hang it under the top category.
        top_loader.add_value(
            'sub_categories', level2_loaders[position].load_item())

        next_position = position + 1
        if next_position == len(level2_links):
            # That was the last level-2 node: the whole tree is assembled.
            yield top_loader.load_item()
            return

        # go to the next level2 node
        next_href = level2_links[next_position].xpath('@href').extract()[0]
        yield Request(
            get_absolute_url(response, next_href),
            callback=self.parse_sub_sub,
            meta={
                'lev2_sel_list': level2_links,
                'idx': next_position,
                'top_category_loader': top_loader,
                'parent_loader_list': level2_loaders,
            },
        )
Example #3
0
 def parse(self, response):
     """Request every top-level category linked from the static category list.

     Yields:
         One Request per matched anchor, with ``parse_sub`` as callback and
         the anchor's loader stored under
         ``meta['top_level_category_loader']``.

     Raises:
         IndexError: if a matched anchor has no ``href``.
     """
     anchors = response.xpath(
         '//div[@class="category"]/div[@class="category_static"]/ul/li/a')
     for anchor in anchors:
         loader = self.generate_loader(anchor, response)
         href = anchor.xpath('@href').extract()[0]
         yield Request(
             get_absolute_url(response, href),
             callback=self.parse_sub,
             meta={'top_level_category_loader': loader},
         )
Example #4
0
 def parse(self, response):
     """Request every top-level category found in the main navigation.

     Only ``li`` entries without an ``id`` attribute are considered; the
     link is taken from each entry's direct ``a`` child.

     Yields:
         One Request per matched entry, with ``parse_sub`` as callback and
         the entry's loader stored under
         ``meta['top_level_category_loader']``.

     Raises:
         IndexError: if a matched entry has no ``a/@href``.
     """
     entries = response.xpath(
         '//div[@id="mainNavGOL"]/ul[@class="gpnavigation"]/li[not(@id)]')
     for entry in entries:
         loader = self.generate_loader(entry, response)
         href = entry.xpath('a/@href').extract()[0]
         yield Request(
             get_absolute_url(response, href),
             callback=self.parse_sub,
             meta={'top_level_category_loader': loader},
         )
Example #5
0
    def extract_values_from_list(self, item, response):
        """Collect the listing-page fields for one product entry.

        Args:
            item: Selector for the product entry; its ``a/@href`` and the
                ``productimagepath`` attribute of ``a/img`` are read.
            response: Response the selector belongs to; handed to
                ``get_absolute_url`` together with the product link.

        Returns:
            dict with the keys ``url`` and ``list_images``. The image value
            is returned exactly as found in the attribute.

        Raises:
            IndexError: if either attribute is missing.
        """
        href = item.xpath('a/@href').extract()[0]
        values = {'url': get_absolute_url(response, href)}
        values['list_images'] = item.xpath(
            'a/img/@productimagepath').extract()[0]
        return values
Example #6
0
    def extract_values_from_list(self, item, response):
        """Collect the listing-page fields for one product entry.

        Args:
            item: Selector for the product entry; the link and image are
                read from its ``div[@class="prodImage"]/a`` child.
            response: Response the selector belongs to; handed to
                ``get_absolute_url`` together with the product link.

        Returns:
            dict with the keys ``url`` and ``list_images``. The image value
            is returned exactly as found in the ``src`` attribute.

        Raises:
            IndexError: if the link or the image ``src`` is missing.
        """
        href = item.xpath('div[@class="prodImage"]/a/@href').extract()[0]
        product_url = get_absolute_url(response, href)
        image_src = item.xpath(
            'div[@class="prodImage"]/a/img/@src').extract()[0]

        values = {}
        values['url'] = product_url
        values['list_images'] = image_src
        return values
Example #7
0
    def parse_sub(self, response):
        """Start the walk over a top-level category's second-level links.

        Collects every second-level link on the page together with a loader
        for it, then requests the first link. ``parse_sub_sub`` receives the
        collected state through ``meta`` and continues from there.

        Reads ``top_category_loader`` from ``response.meta``.

        Yields:
            A Request for the first second-level link, or, when the page has
            no followable second-level link, the loaded top-level item.
        """
        lev2_sel_list = []
        top_category_loader = response.meta['top_category_loader']
        parent_loader_list = []

        for sel in response.xpath('//div[@id="refineBycategory"]/a'):
            # An anchor without an href cannot be requested later, so it is
            # left out instead of aborting the whole category.
            if not sel.xpath('@href').extract():
                continue

            lev2_sel_list.append(sel)
            parent_loader_list.append(self.generate_loader(sel, response))

        if not lev2_sel_list:
            # Nothing to descend into: emit the top-level category as it is
            # rather than failing on lev2_sel_list[0].
            yield top_category_loader.load_item()
            return

        # go to the first level2 node
        idx = 0
        url = get_absolute_url(response,
                               lev2_sel_list[idx].xpath('@href').extract()[0])
        yield Request(url,
                      callback=self.parse_sub_sub,
                      meta={
                          'lev2_sel_list': lev2_sel_list,
                          'idx': idx,
                          'top_category_loader': top_category_loader,
                          'parent_loader_list': parent_loader_list
                      })
Example #8
0
    def parse(self, response):
        """Request every top-level category linked from the web-store menu.

        Links whose URL contains one of the excluded section names are
        skipped and no loader is created for them.

        Yields:
            One Request per remaining link, with ``parse_sub`` as callback
            and the link's loader stored under
            ``meta['top_category_loader']``.

        Raises:
            IndexError: if a menu anchor has no ``href``.
        """
        excluded_sections = (
            'fsa-store', 'gnc-store', 'green-and-natural', 'the-sale',
        )

        for link in response.xpath('//div[@class="webstoremenu"]/ul/li/a'):
            href = link.xpath('@href').extract()[0]
            url = get_absolute_url(response, href)

            if any(section in url for section in excluded_sections):
                continue

            loader = self.generate_loader(link, response)
            yield Request(
                url,
                callback=self.parse_sub,
                meta={'top_category_loader': loader},
            )
Example #9
0
    def extract_values_from_list(self, item, response):
        """Collect the listing-page fields for one product tile.

        Args:
            item: Selector containing the product tile; the thumbnail link
                and its image are read from it.
            response: Response the selector belongs to; handed to
                ``get_absolute_url`` together with the product link.

        Returns:
            dict with the keys ``url``, ``list_images`` and
            ``product_number``. The image value is returned exactly as found
            in the ``src`` attribute.

        Raises:
            IndexError: if the thumbnail link, its image, or a trailing
                ``<name>.html`` segment in the link is missing.
        """
        # The href serves both the URL and the product number, so the link
        # is selected once and reused.
        link_href = item.css(
            'div.product-tile > div.product-image > a.thumb-link::attr(href)'
        )
        url = get_absolute_url(response, link_href.extract()[0])
        list_images = item.css(
            'div.product-tile > div.product-image > a.thumb-link  > img::attr(src)'
        ).extract()[0]
        # The lookahead anchors the match after the last '/'. The dot before
        # "html" is escaped so only a literal ".html" ends the capture; an
        # unescaped dot would cut "my-html-page.html" down to "my".
        product_number = link_href.re(r'(?!.*\/)(.*?)\.html')[0]

        return {
            'url': url,
            'list_images': list_images,
            'product_number': product_number
        }
Example #10
0
    def parse_list(self, response):
        """Turn a JSON product listing into one detail-page request per item.

        The response body is parsed as JSON and its ``items`` list is walked.
        Entries rejected by ``check_scrapable`` are skipped.

        Yields:
            One Request per accepted entry, with ``parse_item`` as callback.
            Its ``meta`` carries a ``splash`` section (``execute`` endpoint
            with a Lua script) and the entry's ``values_from_list``.

        Raises:
            KeyError: if the payload has no ``items`` key, or an accepted
                entry has no ``link`` key.
        """
        payload = json.loads(response.body)
        for entry in payload['items']:
            if not self.check_scrapable(entry):
                continue

            target_url = get_absolute_url(response, entry['link'])
            list_values = self.extract_values_from_list(entry, response)

            # NOTE(review): presumably the script waits for both selectors
            # ('&&') before returning -- confirm in make_lua_script.
            lua_source = make_lua_script(
                [".BVRRWidget", "iframe#contentFrame"], '&&')

            yield Request(
                target_url,
                callback=self.parse_item,
                meta={
                    'splash': {
                        'endpoint': 'execute',
                        'args': {'lua_source': lua_source},
                    },
                    'values_from_list': list_values,
                },
            )