def parse_product_list(self, response):
        hxs = HtmlXPathSelector(response)

        categories = hxs.select('//li[@class="PANEL ALL"]//a/@href').extract()
        categories += hxs.select(
            '//li[@class="PANEL BY-SIZE"]//a/@href').extract()
        categories += hxs.select(
            '//li[@class="PANEL BY-TYPE"]//a/@href').extract()
        for url in categories:
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product_list)

        products = hxs.select('//div[@id="pdList"]//a/@href').extract()
        products += hxs.select(
            '//div[@class="product-tile"]//a/@href').extract()
        for url in products:
            pid = url.split('_')[-1]
            if pid not in self.parsed_products:
                self.parsed_products.append(pid)
                url = url_query_cleaner(response.urljoin(url))
                yield Request(url, callback=self.parse_product)

        product_variants = hxs.select(
            '//div[@class="productVariantTypeOptions"]/a/@href').extract()
        for url in product_variants:
            self.log('productVariantTypeOptions! {}'.format(url))
            pid = url.split('_')[-1]
            if pid not in self.parsed_products:
                self.parsed_products.append(pid)
                url = url_query_cleaner(response.urljoin(url))
                yield Request(url, callback=self.parse_product)

        next_page = None
        cur_page = url_query_parameter(response.url, 'pi', None)
        if cur_page:
            # The spider is already crawling the pages; we just assign the current url
            # so we can increment the 'pi' argument
            next_page = response.url
        else:
            # First page of the product list, we extract the pagination url with regex
            next_page = re.findall('.get\( "(.*)pi=', response.body)
            if next_page:
                next_page = response.urljoin(next_page[0])

        if (next_page and products != response.meta.get('products', [])) or (
                next_page and
                product_variants != response.meta.get('product_variants', [])):
            cur_page = url_query_parameter(next_page, 'pi', '1')
            url = add_or_replace_parameter(next_page, 'pi',
                                           str(int(cur_page) + 1))
            self.log('Goes to next page: ' + url)
            yield Request(url,
                          callback=self.parse_product_list,
                          meta={
                              'products': products,
                              'product_variants': product_variants
                          })
Example #2
 def test_url_query_cleaner_keep_fragments(self):
     self.assertEqual(
         'product.html?id=200#foo',
         url_query_cleaner("product.html?id=200&foo=bar&name=wired#foo",
                           ['id'],
                           keep_fragments=True))
     self.assertEqual(
         'product.html?id=200',
         url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'],
                           keep_fragments=True))
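
Without keep_fragments, the fragment is dropped together with the filtered parameters. A quick sketch of the default behaviour:

    from w3lib.url import url_query_cleaner

    print(url_query_cleaner("product.html?id=200&foo=bar#foo", ["id"]))
    # -> product.html?id=200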
Example #3
    def _set_image_value(self, instance, image_value):
        """
        Instance에 Image value를
        :param instance:
        :param image_value:
        :return:
        """
        try:
            decoded_data = self._parse_base64(image_base64=image_value)
            filename = self._generate_filename()
            image = self._process_image(data=decoded_data, max_size=600)
            instance.image.save(
                filename,
                image,
                save=False,
            )
            url = url_query_cleaner(instance.image.url)
            instance.image_insert_value = {"image": f"{url}"}

        # If the image is not base64:
        # - if it contains a url, store that url in image_insert_value
        # - if it is neither base64 nor a link, the format is invalid: ValueError
        except AttributeError:
            if (image_value[:4] == "http"
                    or image_value[:6] == settings.MEDIA_URL):
                instance.image_insert_value = {"image": f"{image_value}"}
            else:
                raise ValueError(
                    "Not a valid Base64 image. Check that it starts with "
                    "data:image/png;base64."
                )
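
Here the query string is stripped so that a storage access token does not end up in the saved value. A minimal sketch with a hypothetical presigned URL (this is only safe if the object stays readable without the token):

    from w3lib.url import url_query_cleaner

    signed = 'https://bucket.s3.amazonaws.com/a.png?X-Amz-Signature=abc&X-Amz-Expires=3600'
    print(url_query_cleaner(signed))
    # -> https://bucket.s3.amazonaws.com/a.png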
Example #4
 def parse_json(self, response):
     data = json.loads(response.body)
     selector = Selector(text=data['products'])
     for url in selector.xpath('//a/@href[contains(., ".prd")]').extract():
         yield Request(
             url_query_cleaner(response.urljoin(url), ('skuId', )),
             self.parse_product)
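
Passing a parameterlist keeps only the named arguments, so the requests above preserve skuId and drop everything else. Sketch with a made-up URL:

    from w3lib.url import url_query_cleaner

    url = 'https://shop.example/item.prd?skuId=42&utm_source=x&ref=home'
    print(url_query_cleaner(url, ('skuId',)))
    # -> https://shop.example/item.prd?skuId=42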
Example #5
def clean_url(url):
    if "youtube.com" in url or "youtu.be" in url:
        return url
    # u = url_normalize(url)
    u = url
    u = url_query_cleaner(u,
                          parameterlist=[
                              'utm_source', 'utm_medium', 'utm_campaign',
                              'utm_term', 'utm_content'
                          ],
                          remove=True,
                          keep_fragments=True)
    # if "cdn.discordapp.com" in u and ".gif" in u:
    #     u = u.replace("cdn.discordapp.com", "media.discordapp.net")

    if len(u) == len(url):
        u = url
    # headers = {'User-Agent': 'Mozilla/5.0'}
    # response = requests.get(u, headers=headers)
    # if response.history:
    #     u = response.url
    if "https://www.google.com/url?q=" in u:
        u = clean_url(u.replace("https://www.google.com/url?q=", ""))
    # ancre = re.search(r"\#\w*$", url)
    # if ancre is not None and ancre.group(0) not in u:
    #     u = u + ancre.group(0)
    if u[-1:] == '#':
        return u[:-1]
    else:
        return u
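
A usage sketch for clean_url on a hypothetical URL: the utm_* tracking parameters are removed while the fragment survives:

    print(clean_url('https://blog.example/post?utm_source=tw&utm_medium=social#intro'))
    # -> https://blog.example/post#intro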
Example #6
    def request_fingerprint(self, request):

        url = url_query_cleaner(request.url, ['snr'], remove=True)

        request = request.replace(url=url)

        return super().request_fingerprint(request)
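
snr is Steam's navigation-tracking parameter (an assumption based on the Steam-related examples elsewhere on this page); removing it before fingerprinting lets the dupefilter treat tracked and untracked URLs as the same request:

    from w3lib.url import url_query_cleaner

    url = 'https://store.steampowered.com/app/10/?snr=1_7_7_230'
    print(url_query_cleaner(url, ['snr'], remove=True))
    # -> https://store.steampowered.com/app/10/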
Example #7
class AsosItem(scrapy.Item):
    """Scrapy item to store scraped data from asos.com.

    Attributes:
        article_type (scrapy.Field): List of the associated article type, for example: ['women', 'shoes'].
        product_name (scrapy.Field): Str of the product name, for example: New Look Satin Twist Slider.
        product_url (scrapy.Field): Str of the url of the product.
        brand_name (scrapy.Field): Str of associated brand of the product, for example: New Look.
        price (scrapy.Field): String of the price of the product.
        fit (scrapy.Field): List of the different sizes for the product.
        colors (scrapy.Field): List of colors for the product.
        details_and_care_info (scrapy.Field): List of care information for the product.
        details_and_care_list (scrapy.Field): List of care and details information.
        image_urls (scrapy.Field): List of urls of the images.
        images (scrapy.Field): List of hashes for corresponding image_urls.
        spider_name (scrapy.Field): Str of spider name.

    """

    article_type = scrapy.Field(output_processor=RemoveSaleHome())
    product_name = scrapy.Field(output_processor=TakeFirst())
    product_url = scrapy.Field(output_processor=TakeFirst())
    brand_name = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst())
    fit = scrapy.Field()
    colors = scrapy.Field(output_processor=TakeFirst())
    details_and_care_info = scrapy.Field(output_processor=TakeFirst())
    details_and_care_list = scrapy.Field(output_processor=TakeFirst())
    image_urls = scrapy.Field(
        output_processor=MapCompose(lambda x: url_query_cleaner(x)))
    images = scrapy.Field()
    spider_name = scrapy.Field(output_processor=TakeFirst())
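
Since url_query_cleaner already accepts a single URL, the lambda wrapper on image_urls is redundant; the field could equivalently be declared as:

    image_urls = scrapy.Field(output_processor=MapCompose(url_query_cleaner))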
Example #8
    def parse_product(self, response):
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath(
            '//input[@name="productId"]/@value').extract_first()
        if not identifier:
            loader.add_value('stock', 0)
            identifier = response.xpath('//text()').re('productId=(.+?)&')
        loader.add_value('identifier', identifier)
        loader.add_value('url', url_query_cleaner(response.url))
        loader.add_css('name', 'div.productTitleDescriptionContainer h1::text')
        loader.add_css('price', 'p.pricePerUnit::text')
        loader.add_css('sku', 'p.itemCode::text', re='Item code:(.+)')
        category = response.xpath(
            '//ul[@id="breadcrumbNavList"]//a/span/text()').extract()
        if 'Home' in category:
            category.remove('Home')
        loader.add_value('category', category)
        image_url = response.css(
            'img#productImageID::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        item = loader.load_item()
        item['metadata'] = {'reviews': []}

        review_id = response.xpath('//text()').re_first("productId: '(.+?)'")
        reviews_url = 'http://sainsburysgrocery.ugc.bazaarvoice.com/8076-en_gb/%s/reviews.djs?format=embeddedhtml' % review_id
        yield Request(reviews_url,
                      callback=self.parse_review_page,
                      meta={'item': item})
Example #9
 def parse(self, response):
     self.state['items_count'] = self.state.get('items_count', 0) + 1
     response = response.replace(url=url_query_cleaner(response.url))
     #self.log('Page: %s' % response.url)
     hxs = HtmlXPathSelector(response)
     index_level = self.determine_level(response)
     if index_level in [1, 2, 3, 4]:
         #self.save_to_file_system(index_level, response)
         relative_urls = self.get_follow_links(index_level, hxs)
         if relative_urls is not None:
             for url in relative_urls:
                 yield Request(url, callback=self.parse)
     elif index_level == 5:
         #self.log('Level 5, parsing profile');
         linkedin_id = self.get_linkedin_id(response.url)
         person_profile = LinkedinProfileParser.parse_profile(hxs)
         if person_profile is None:
             return
         linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup.encode("utf-8")
         #self.log('ID: ' + linkedin_id)
         if linkedin_id:
             m = hashlib.md5()
             m.update(linkedin_id)
             person_profile['_id'] = UnicodeDammit(m.hexdigest()).markup
             person_profile['profile_id'] = linkedin_id
             person_profile['url'] = UnicodeDammit(response.url).markup
             yield person_profile
Example #10
    def parse_product(self, response):
        loader = ProductLoader(Product(), response=response)
        identifier = response.css(
            'input.productId::attr(value)').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('url', url_query_cleaner(response.url))
        loader.add_css('name', '.title h1::text')
        category = response.css('.breadcrumbs a::text').extract()
        loader.add_value('category', category[2:])
        image_url = response.css(
            '.productDetail1 .image img::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_value('brand', category[-1])
        item = loader.load_item()

        for option in response.xpath('//div[@id="valStaffelSelection"]//li'):
            loader = ProductLoader(Product(), selector=option)
            loader.add_value(None, item)
            identifier = item['identifier'] + '-' + option.xpath(
                'input/@value').extract_first()
            loader.replace_value('identifier', identifier)
            url = item['url'] + '?' + option.xpath('@class').extract_first()
            loader.replace_value('url', url)
            loader.add_css('name', 'span.label::text')
            price = option.css('div.price::text').extract()
            loader.replace_value('price', price.pop())
            loader.replace_value('sku', identifier)
            yield loader.load_item()
Example #11
def load_products(response):
    """Load a ProductItem from the product page response."""
    loader = ProductItemLoader(item=ProductItem(),response=response)
    url = url_query_cleaner(response.url, ['snr'], remove=True)
    url = canonicalize_url(url)
    loader.add_value('url', url)
    publisher = response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
    if publisher is None:
        loader.add_xpath('developer','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[2]/span[2]/text()')
    else:
        loader.add_xpath('developer','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[2]/span[2]/text()')
    loader.add_xpath('release_date','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][4]/div[2]/text()')
    loader.add_css('app_name', '.header__title ::text')
    loader.add_css('specs', '.game-features__title ::text')
    loader.add_css('genre', '.product-details__data span a.un ::text')

    try:
        price = response.css('.module-buy__info > meta:nth-child(2) ::attr(content)').extract_first()
        price_disc = price
    except:
        price = None
        price_disc = price

    if price is None:
        price = '0.00'
        price_disc = price
    loader.add_value('price', price)
    loader.add_value('discount_price', price_disc)
    

    loader.add_css('rating', 'div.average-rating:nth-child(1) > meta:nth-child(4) ::attr(content)')
    
    return loader.load_item()
Example #12
    def parse_product(self, response):
        options = response.css('.pg_select')
        if options:
            selected_option = options.xpath('option[@selected]')
            if not selected_option:
                for url in options.xpath('.//@data-href').extract():
                    yield Request(response.urljoin(url_query_cleaner(url)),
                                  self.parse_product)
                return

        loader = ProductLoader(Product(), response=response)
        sku = response.xpath(
            '//div[@id="content"]//input[@name="sku"]/@value').extract_first()
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//strong[@itemprop="name"]/text()')
        loader.add_css('price', 'div.show h5 ::text')
        loader.add_css('price', '.nowPrice ::text')
        loader.add_css('price', '.typicalPrice h5 ::text')
        category = response.xpath('//input[@name="productDetailsDTO"]/@value'
                                  ).re('"category":"(.+?)"')
        if category:
            loader.add_value('category', category[0].split('/'))
        image_url = response.css(
            'ul#galleryImages a::attr(href)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath(
            'brand',
            '//span[@itemprop="brand"]//span[@itemprop="name"]/text()')
        if response.css('div#content p.oos'):
            loader.add_value('stock', 0)
        yield loader.load_item()
Example #13
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        categories = response.xpath(
            '//ul[@id="category-level-1"]//a/@href').extract()
        for category in categories:
            yield Request(response.urljoin(category))

        products = response.css('div.b-product_title a::attr(href)').extract()
        for product in products:
            yield Request(url_query_cleaner(response.urljoin(product)),
                          callback=self.parse_product,
                          meta=response.meta)

        pages = response.css('ul.b-pagination a::attr(href)').extract()
        for url in pages:
            yield Request(url, meta=response.meta)

        identifier = hxs.select(
            '//p[contains(@class, "productid")]/@class').re('p_(.*)')
        if identifier:
            yield Request(response.url,
                          dont_filter=True,
                          callback=self.parse_product,
                          meta=response.meta)
Example #14
    def parse_url(self, url: URL) -> str:
        # Keep the query strings if they might be feed strings.
        # Wikipedia for example uses query strings to differentiate feeds.
        if any(key in url.query for key in self.valid_keys):
            return canonicalize_url(str(url))

        # Canonicalizing the URL is about 4x slower, but worth it to prevent duplicate requests.
        return canonicalize_url(url_query_cleaner(str(url)))
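
The trade-off the comment describes, in one sketch: url_query_cleaner drops the query, and canonicalize_url then normalizes what remains (sorted query arguments, percent-encoding, and so on). Hypothetical URL:

    from w3lib.url import canonicalize_url, url_query_cleaner

    url = 'https://en.wikipedia.org/w/index.php?title=Scrapy&action=history'
    print(canonicalize_url(url_query_cleaner(url)))
    # -> https://en.wikipedia.org/w/index.php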
Example #15
 def test_url_query_cleaner_keep_fragments(self):
     self.assertEqual(
         "product.html?id=200#foo",
         url_query_cleaner(
             "product.html?id=200&foo=bar&name=wired#foo",
             ["id"],
             keep_fragments=True,
         ),
     )
Example #16
    def parse_product(self, response):
        base_sku = response.xpath('//@data-ref').extract_first()
        identifier = re.search('p(\d+)$',
                               url_query_cleaner(response.url)).group(1)
        url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
            identifier)
        data = json.load(urlopen(url))
        attributes = [attr['values'] for attr in data['attributes']]
        if [] in attributes:
            url = add_or_replace_parameter(url, 'attributes[1]',
                                           attributes[0][0]['value_id'])
            data = json.load(urlopen(url))
            attributes = [attr['values'] for attr in data['attributes']]
        variants = itertools.product(*attributes)
        for variant in variants:
            url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
                identifier)
            for idx, option in enumerate(variant):
                url = add_or_replace_parameter(
                    url, 'attributes[{0}]'.format(idx + 1), option['value_id'])
            data = json.load(urlopen(url))
            selection = data['selection'].values()[0]
            sku = selection['reference'].strip()
            if not sku and base_sku not in self.skus_found:
                sku = base_sku
            if sku not in self.skus.keys():
                continue
            if sku in self.skus_found:
                self.logger.info('Duplicated SKU is found: %s' % sku)
            self.skus_found.add(sku)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('sku', sku)
            loader.add_value('identifier', selection['product_id'])
            loader.add_xpath('name', '//span[@id="js-product-title"]/text()')
            loader.add_value('name', [option['value'] for option in variant])
            loader.replace_value('name', selection['title'])
            loader.add_value('url', response.url)
            loader.add_value('price', selection['price_inc'])
            category = response.css('div.breadcrumb a::attr(title)').extract()
            loader.add_value('category', category[1:])
            try:
                image_url = [
                    attr['images'][0]['image']
                    for attr in data['attributes'][-1]['values']
                ]
            except IndexError:
                image_url = response.xpath(
                    '//div[@id="js-product-image"]//@src').extract()
            loader.add_value('image_url', response.urljoin(image_url[0]))
            loader.add_value('brand', "Andrew James")
            item = loader.load_item()

            metadata = AndrewJamesMeta()
            metadata['asin'] = self.skus[sku]['ASIN']
            item['metadata'] = metadata
            yield item
Example #17
	def _clean_url(self, url):
		"""
		Canonicalizes the url, as it is done in Scrapy.
		And keeps only USEFUL_QUERY_KEYS. It also strips the 
		trailing slash to help identifying dupes.
		"""
		clean_url = url_query_cleaner(url, 
			parameterlist=USEFUL_QUERY_KEYS) # , remove=True
		return canonicalize_url(clean_url).rstrip('/')
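
A usage sketch, assuming a hypothetical USEFUL_QUERY_KEYS = ['page']:

    from w3lib.url import canonicalize_url, url_query_cleaner

    clean = url_query_cleaner('http://example.com/articles/?utm_source=x',
                              parameterlist=['page'])
    print(canonicalize_url(clean).rstrip('/'))
    # -> http://example.com/articles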
Example #18
    def fingerprint(self, lnk, **kw):
        url = canonicalize_url(lnk.url)
        # pconf = kw.get('conf')
        # if not pconf:
        #     pconf = xconf.get_page(project, job, lnk.page)
        qo = kw.get('df_query_only')
        qr = kw.get('df_query_remove')
        if qo:
            url = url_query_cleaner(url, arg_to_iter(qo), remove=False)
        if qr:
            url = url_query_cleaner(url, arg_to_iter(qr), remove=True)

        cnf = lnk.conf
        mds = [lnk.page, url]
        for key in ('method', 'headers', 'data', 'params', 'auth', 'cookies'):
            mds.append(cnf.get(key))

        return md5sum(mds)
Example #19
def load_product(response):
	"""Load a ProductItem from the product page response."""
	loader = ProductItemLoader(item=ProductItem(), response=response)
	
	url = url_query_cleaner(response.url, ['snr'], remove=True)
	url = canonicalize_url(url)
	loader.add_value('url', url)
	
	found_id = re.findall('/app/(.*?)/', response.url)
	if found_id:
		id = found_id[0]
		reviews_url = f'http://steamcommunity.com/app/{id}/reviews/?browsefilter=mostrecent&p=1'
		loader.add_value('id', id)
	
	# Publication details.
	details = response.css('.details_block').extract_first()
	try:
		details = details.split('<br>')
		
		for line in details:
			line = re.sub('<[^<]+?>', '', line)  # Remove tags.
			line = re.sub('[\r\t\n]', '', line).strip()
			for prop, name in [
				('Title:', 'title'),
				('Genre:', 'genres'),
				('Release Date:', 'date')
			]:
				if prop in line:
					item = line.replace(prop, '').strip()
					loader.add_value(name, item)
	except:  # noqa E722
		pass
	
	loader.add_css('app_name', '.apphub_AppName ::text')
	
	price = response.css('.game_purchase_price ::text').extract_first()
	if not price:
		price = response.css('.discount_original_price ::text').extract_first()
		loader.add_css('discount_price', '.discount_final_price ::text')
	loader.add_value('price', price)
	
	sentiment = response.css('.game_review_summary').xpath(
		'../*[@itemprop="description"]/text()').extract()
	loader.add_value('sentiment', sentiment)
	loader.add_css('n_reviews', '.responsive_hidden', re='\(([\d,]+) reviews\)')
	
	loader.add_xpath(
		'metascore',
		'//div[@id="game_area_metascore"]/div[contains(@class, "score")]/text()')
	
	early_access = response.css('.early_access_header')
	if early_access:
		loader.add_value('early_access', True)
	else:
		loader.add_value('early_access', False)

	return loader.load_item()
Example #20
    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        brand = response.xpath(
            '//span[@itemprop="http://schema.org/manufacturer"]/text()'
        ).extract_first() or response.xpath(
            '//span[@itemprop="http://schema.org/brand"]/text()'
        ).extract_first()

        identifier = hxs.select('//input[@id="itemsArray"]/@value').extract()
        if not identifier:
            return
        sku = response.xpath(
            '//*[@itemprop="mpn"]/text()').extract()[0].strip()
        product_loader = ProductLoader(item=Product(), selector=hxs)
        image_url = response.css(
            'img#productMainImage::attr(src)').extract_first()
        if image_url:
            product_loader.add_value('image_url', response.urljoin(image_url))

        category = response.meta.get('category', '')
        if not category:
            category = hxs.select('//div[@id="breadcrumb"]/ul/li/a/text()'
                                  ).extract()[-2].strip()

        product_loader.add_value('category', category)

        product_name = response.xpath('//div[@id="product"]//h1//text()').re(
            '\S+')

        product_loader.add_value('name', product_name)
        product_loader.add_xpath('url', 'link[@rel="canonical"]/@href')
        product_loader.add_value('url', url_query_cleaner(response.url))
        product_loader.add_value('identifier', identifier.pop())

        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)
        price = ''.join(
            hxs.select(
                '//table[contains(@class, "pricing")]//td[@class="threeColTd"][1]/text()'
            ).extract()).strip().split('(')[0].strip().replace(u'\xa3', '')
        if price:
            price = extract_price(price)
            price = price.quantize(Decimal('.01'))
            product_loader.add_value('price', price)
        else:
            product_loader.add_value('price', 0)

        stock = response.css('span.availability::text').re('\d+')
        if stock:
            product_loader.add_value('stock', stock[0])
        else:
            product_loader.add_value('stock', 0)

        yield product_loader.load_item()
Example #21
    def parse_product(self, response):
        base_url = get_base_url(response)

        product_links = response.xpath('//div[@id="products"]//a[contains(@class,"qa-product-link")]/@href').extract()
        if product_links:
            for link in product_links:
                yield Request(url_query_cleaner(response.urljoin(link)), callback=self.parse_product)
            return

        product_name = response.xpath('//h1[@itemprop="name"]/text()').extract()
        if not product_name:
            return
        product_name = product_name[-1].strip()
        category = re.findall("name:'Category', value:'([^']+)'", response.body.replace("\\'", "&quote;"))
        if category:
            category = category.pop().replace("&quote;", "'")
        else:
            category = ""
        brand = response.xpath('//h1[@itemprop="name"]/span/text()').extract()
        brand = brand[0].strip() if brand else ''

        rrp_by_sku = {}

        sku_data = re.search(r'BC.product.skusCollection = \$.parseJSON\((.*)\);', response.body)
        if sku_data:
            sku_data = json.loads(demjson.decode(sku_data.group(1), encoding='utf8' ))
            rrp_by_sku = {sku.upper():str(opt['price']['high']) for sku, opt in sku_data.iteritems() if opt['price']['high']>opt['price']['low']}


        options = response.xpath('//li[contains(@class,"qa-variant-item-")]')
        for option in options:
            product_loader = ProductLoader(item=Product(), selector=option)
            sku = option.xpath('./@sku-value').extract()
            sku = sku[0]
            product_loader.add_value('sku', sku)
            product_loader.add_value('identifier', sku)
            option_name = option.xpath('./@title').extract()[0].strip()
            option_name = option_name.replace('One Color, One Size', '').replace(', One Size', '').replace('One Color, ', '').strip()
            if option_name != '':
                product_loader.add_value('name', product_name + ', ' + option_name)
            else:
                product_loader.add_value('name', product_name)
            image_url = option.xpath('./@data-img-large').extract()
            if image_url:
                product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            price = extract_price(option.xpath('./@data-price').extract()[0])
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product = product_loader.load_item()
            metadata = CRCMeta()
            metadata['rrp'] = rrp_by_sku.get(sku.upper(), '')
            product['metadata'] = metadata
            yield product
Example #22
class RebelSport(CrawlSpider):
    name = 'kitbag_au-rebelsport'
    allowed_domains = ['rebelsport.com.au']
    start_urls = [
        'http://www.rebelsport.com.au/store/fangear/soccer-football/604'
    ]

    categories = LinkExtractor(
        restrict_css='.secondary-menu',
        process_value=lambda url: add_or_replace_parameter(
            url, 'pageSize', '500'))
    pages = LinkExtractor(restrict_css='.pagination')
    products = LinkExtractor(
        restrict_css='.product',
        process_value=lambda url: make_variant_url(url_query_cleaner(url)))

    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        data = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
        data = json.loads(data.replace('\\"', '"'))
        variants = data['Variants']
        for variant in variants:
            url = response.urljoin(variant['ProductPLU'])
            yield Request(make_variant_url(url), self.parse_product)

        loader = ProductLoader(item=Product(), response=response)
        identifier = response.xpath(
            '//input[@id="ProductPLU"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')
        metadata = {}
        for i in xrange(3):
            variant_name = data['Variant%dSelected' % (i + 1)]
            if variant_name and variant_name != 'N/A':
                loader.add_value('name', variant_name)
                metadata[data['Variant%dHeader' % (i + 1)]] = variant_name
                if 'size' in variant_name.lower():
                    metadata['size'] = variant_name[5:].strip()
        price = response.css('.price-value .currency::text').extract()
        loader.add_value('price', price.pop())
        category = response.css('.breadcrumb a::text').extract()
        loader.add_value('category', category[1:])
        loader.add_css('image_url', '.product-image::attr(src)')
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('shipping_cost', '7.95')
        stock = response.css('.product-stock-widget::attr(ng-init)').re(
            'AvailableOnline: (\w+)')[0]
        if stock != 'true':
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = metadata
        yield item
Example #23
 def test_url_query_cleaner(self):
     self.assertEqual(
         'product.html?id=200',
         url_query_cleaner("product.html?id=200&foo=bar&name=wired",
                           ['id']))
     self.assertEqual(
         'product.html?id=200',
         url_query_cleaner("product.html?&id=200&&foo=bar&name=wired",
                           ['id']))
     self.assertEqual(
         'product.html',
         url_query_cleaner("product.html?foo=bar&name=wired", ['id']))
     self.assertEqual(
         'product.html?id=200&name=wired',
         url_query_cleaner("product.html?id=200&foo=bar&name=wired",
                           ['id', 'name']))
     self.assertEqual(
         'product.html?id',
         url_query_cleaner("product.html?id&other=3&novalue=", ['id']))
     self.assertEqual(
         'product.html?d=1&d=2&d=3',
         url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other",
                           ['d'],
                           unique=False))
     self.assertEqual(
         'product.html?id=200&foo=bar',
         url_query_cleaner("product.html?id=200&foo=bar&name=wired#id20",
                           ['id', 'foo']))
     self.assertEqual(
         'product.html?foo=bar&name=wired',
         url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'],
                           remove=True))
     self.assertEqual(
         'product.html?name=wired',
         url_query_cleaner("product.html?id=2&foo=bar&name=wired",
                           ['id', 'foo'],
                           remove=True))
     self.assertEqual(
         'product.html?foo=bar&name=wired',
         url_query_cleaner("product.html?id=2&foo=bar&name=wired",
                           ['id', 'footo'],
                           remove=True))
Example #24
    def is_href_matching(url_string: str, regex: re.Pattern) -> bool:
        """
        Check if the regex has any match in the url string.

        :param url_string: URL as string
        :param regex: Regex used to search URL
        :return: boolean
        """
        if regex.search(url_query_cleaner(url_string)):
            return True
        return False
Example #25
    def resultsPage(self, response):
        table = response.selector.xpath('/html/body/div/div[3]/div[2]/table')
        cisResultLinks = table.css('.left-align').xpath('a/@href').extract()
        for link in cisResultLinks:
            yield Request(urlparse.urljoin(response.url, link), 
                          callback = self.surveyResult)

        if len(response.selector.xpath('//input[@value="Next page"]')) != 0:
            yield FormRequest.from_response(response,
                                            formxpath='//div[@class="page-forward"]/form[1]',
                                            url = url_query_cleaner(response.url), # workaround for scrapy problem
                                            callback = self.resultsPage)
Example #26
def _link_callback(uri: str, rel: str) -> str:
    """Replace default link loading in xhtml2pdf.
    
    We don't want the pdf generation process to actually attempt to hit the network or
    filesystem so we return a placeholder data URL for links that appear to be images
    otherwise we simply return an empty string so that nothing is loaded."""
    uri = url_query_cleaner(uri)
    type_, _ = mimetypes.guess_type(uri)
    if type_ and type_.startswith('image/'):
        return _BLACK_PIXEL_DATA_URL
    else:
        return ''
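
A usage sketch: the query is cleaned off first so that mimetypes can see the real file extension (guess_type would not recognize 'logo.png?v=3'):

    import mimetypes
    from w3lib.url import url_query_cleaner

    print(mimetypes.guess_type(url_query_cleaner('https://cdn.example/logo.png?v=3')))
    # -> ('image/png', None)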
Example #27
 def test_url_query_cleaner(self):
     self.assertEqual('product.html?id=200',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id']))
     self.assertEqual('product.html?id=200',
             url_query_cleaner("product.html?&id=200&&foo=bar&name=wired", ['id']))
     self.assertEqual('product.html',
             url_query_cleaner("product.html?foo=bar&name=wired", ['id']))
     self.assertEqual('product.html?id=200&name=wired',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name']))
     self.assertEqual('product.html?id',
             url_query_cleaner("product.html?id&other=3&novalue=", ['id']))
     self.assertEqual('product.html?d=1&d=2&d=3',
             url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False))
     self.assertEqual('product.html?id=200&foo=bar',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired#id20", ['id', 'foo']))
     self.assertEqual('product.html?foo=bar&name=wired',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True))
     self.assertEqual('product.html?name=wired',
             url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True))
     self.assertEqual('product.html?foo=bar&name=wired',
             url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'footo'], remove=True))
Example #28
 def parse_products_list(self, response):
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)
     brand = response.meta.get('brand')
     for url in hxs.select('//div[@class="product_name"]//a/@href | //div[@class="product_features"]/h3/a/@href').extract():
         self.jar_counter += 1
         yield Request(url_query_cleaner(response.urljoin(url)),
                       callback=self.parse_product,
                       cookies={},
                       meta={'cookiejar': self.jar_counter, 'brand': brand})
     for url in hxs.select('//ul[@class="catthumb_list clearfix"]//div[@class="title"]/a/@href').extract():
         yield Request(urljoin_rfc(base_url, url), callback=self.parse_products_list)
Example #29
    def parse_product(self, response):
        if response.url.endswith('page-not-found.page'):
            return
        formdata = {}
        for inp in response.xpath('//form[@id="variant-form"]//input'):
            formdata[inp.xpath('@name').extract_first()] = inp.xpath(
                '@value').extract_first()
        if not formdata:
            self.logger.warning('No data on %s' % response.url)
            return
        del formdata[None]
        options = response.css('.vContainer .variantDataElement')
        for option in options:
            formdata[option.xpath('@name').extract_first()] = option.xpath(
                '@data-variant-value').extract_first()
            r = FormRequest.from_response(
                response,
                formxpath='//form[@id="variant-form"]',
                formdata=formdata,
                callback=self.parse_product)
            yield r

        loader = ProductLoader(item=Product(), response=response)
        sku = response.xpath('//input[@id="skuIdVal"]/@value').extract_first()
        if sku != url_query_parameter(response.url, 'skuId'):
            url = add_or_replace_parameter(url_query_cleaner(response.url),
                                           'skuId', sku)
            yield Request(url, self.parse_product)
            return
        loader.add_value('identifier', sku)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="productLabel"]//text()')
        #loader.add_css('name', '.selected .variantDisplayName_title ::text')
        loader.add_css('price', '.current-price ::text')
        loader.add_value('sku', sku)
        category = response.xpath(
            '//div[@id="breadcrumb"]//li//span[@itemprop="title"]/text()'
        ).extract()
        loader.add_value('category', category[-4:-1])
        image_url = response.xpath(
            '//img[@itemprop="image"]/@src').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath(
            'brand', '//div[@itemprop="brand"]//span[@itemprop="name"]/text()')
        loader.add_value('shipping_cost', 3)
        #if not response.css('.stock-tag.in-stock') and not response.xpath('//link[@href="http://schema.org/InStock"]') and not response.css('.available-from'):
        if not response.css('.add-to-basket'):
            loader.add_value('stock', 0)
        if loader.get_output_value('price'):
            yield loader.load_item()
Example #30
    def is_valid_filetype(url: str) -> bool:
        """
        Check if url string has an invalid filetype extension.

        :param url: URL string
        :return: boolean
        """
        # if file_regex.search(url.strip()):
        #     return False
        # return True
        suffix = pathlib.Path(url_query_cleaner(url)).suffix.strip(".").lower()
        if suffix in invalid_filetypes:
            return False
        return True
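
Same idea with pathlib: the query has to be cleaned first, or the suffix would include it. Sketch, assuming a hypothetical invalid_filetypes set:

    import pathlib
    from w3lib.url import url_query_cleaner

    url = 'https://example.com/report.PDF?download=1'
    print(pathlib.Path(url_query_cleaner(url)).suffix.strip('.').lower())
    # -> pdf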
Example #31
 def img_base64_to_link(self, objs: QuerySet, html: str):
     """
     Replace the Base64 images in the HTML string with the image urls from the objs queryset and return the new HTML string
     :param objs:
     :param html:
     :return:
     """
     soup = BeautifulSoup(html, 'html.parser')
     img_tags = soup.find_all("img")
     for obj, img_tag in zip(objs, img_tags):
         # Get rid of Amazon Token
         url = url_query_cleaner(obj.image.url)
         new_img_tag = soup.new_tag('img', src=url)
         img_tag.replace_with(new_img_tag)
     return str(soup)
Example #32
    def parse_search_results(self, response):
        products = response.xpath(
            '//table[@id="sProdList"]/tbody/tr[td[@class="productImage"]]')
        for product in products:
            sku = product.css('p.wordBreak a::text').extract_first()
            if sku and sku.strip().upper() == response.meta['sku']:
                url = product.xpath(
                    './/a[@class="sku"]/@href').extract_first().strip()
                url = url_query_cleaner(url)
                yield Request(url, self.parse_product)

        sku = response.xpath('//*[@itemprop="mpn"]/text()').extract_first()
        if not products and sku and sku.strip().upper(
        ) == response.meta['sku']:
            yield Request(url_query_cleaner(response.url),
                          self.parse_product,
                          dont_filter=True)

        urls = response.css('ul.categoryList a::attr(href)').extract()
        if not products and not sku and urls:
            for url in urls:
                yield Request(url,
                              self.parse_search_results,
                              meta=response.meta)
Example #33
 def parse(self, response):
     self.state['items_count'] = self.state.get('items_count', 0) + 1
     response = response.replace(url=url_query_cleaner(response.url))
     hxs = HtmlXPathSelector(response)
     index_level = self.determine_level(response)
     if index_level in [1, 2]:
         relative_urls = self.get_follow_links(index_level, hxs)
         if relative_urls is not None:
             for url in relative_urls:
                 yield Request(url, callback=self.parse)
     elif index_level == 3:
         vacature = ITBanenParser.parse_profile(hxs)
         if vacature is None:
             return
         vacature['url'] = UnicodeDammit(response.url).markup
         yield vacature
Example #34
 def parse_content(self,response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         yield scrapy.Request(url=response.url,callback=self.parse_content)     
         return
      base_url = get_base_url(response)
      # Parse the article links
      for href in response.xpath('//table//a/@href').extract():
          if "view_abstract.aspx?" in href:
              href = url_query_cleaner(href, ("file_no",))
          elif "create_pdf.aspx?" in href:
              pass
          else:
              continue
          abs_url = urljoin_rfc(base_url, href)
          yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
Example #35
    def parse_hotel(self, response):
        hxs = Selector(response)
        hotel = HtmlParser.extract_hotel(response.url, hxs)

        checkin = url_query_parameter(response.url,"checkin")
        checkout = url_query_parameter(response.url,"checkout")

        checkinDatetime = None
        checkoutDatetime = None

        today = datetime.date.today()

        if checkin is not None:
            checkinDatetime = datetime.datetime.strptime(checkin, "%Y-%m-%d").date()
            checkinDatetime = self.add_months(checkinDatetime,1)
        else:
            checkinDatetime = datetime.date(today.year, today.month, 15)

        if checkout is not None:
            checkoutDatetime = datetime.datetime.strptime(checkout, "%Y-%m-%d").date()
            checkoutDatetime = self.add_months(checkoutDatetime,1)
        else:
            checkoutDatetime = datetime.date(today.year, today.month, 16)

        maxDatetime = self.add_months(today,18)

        if checkinDatetime < maxDatetime:
            url = url_query_cleaner(response.url)
            url = add_or_replace_parameter(url,"checkin",str(checkinDatetime))
            url = add_or_replace_parameter(url,"checkout",str(checkoutDatetime))
            #logging.warning('----------------------------  %s' % url)
            yield Request(url, callback=self.parse_hotel)

        yield hotel["hotel"]

        if len(hotel["rooms"]) > 0:
            for room in hotel["rooms"]:
                yield room
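
The clean-then-rebuild pattern above in isolation, on a hypothetical hotel URL: strip every query argument, then re-add only the dates that should change:

    from w3lib.url import add_or_replace_parameter, url_query_cleaner

    url = url_query_cleaner('https://hotels.example/h/1?checkin=2024-01-15&tab=rooms')
    url = add_or_replace_parameter(url, 'checkin', '2024-02-15')
    url = add_or_replace_parameter(url, 'checkout', '2024-02-16')
    print(url)
    # -> https://hotels.example/h/1?checkin=2024-02-15&checkout=2024-02-16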
Example #36
    def parse(self, response):
        # Get product details if /ip/ is in the URL
        if '/ip/' in response.url:
            # Remove the unnecessary parameters from the product url
            clean_url = url_query_cleaner(response.url)
            # Create a new Product
            p = Product()
            p['url'] = clean_url
            p['title'] = response.xpath("//h1[@itemprop='name']/span/text()").extract()[0].strip()
            price_data = response.xpath("//div[@itemprop='price']//text()").extract()

            if price_data:
                p['price'] = Decimal("".join(price_data[2:7]))
            else:
                p['price'] = Decimal("0")

            yield p

        # Check all of the links on the current page
        for link in response.xpath("//a/@href").extract():
            # Create an absolute url
            abs_url = urlparse.urljoin(response.url, link.strip())
            # Create a new request for a spider to crawl
            yield Request(url=abs_url)
Example #37
 def __parse_as_next_page__(self, response):
     refer = response.request.headers.get('Referer')
     chart = self.__chart_items[url_query_cleaner(refer)]
     yield self.do_parse(chart, response)
Example #38
def url_cleaner(url):
    url = url_query_cleaner(url)
    url = url.replace('://www.', '://')  # drop a leading 'www.' from the host
    return url
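
A quick check of the intended behaviour:

    print(url_cleaner('https://www.example.com/p?a=1'))
    # -> https://example.com/p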
Example #39
 def test_url_query_cleaner_keep_fragments(self):
     self.assertEqual('product.html?id=200#foo',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired#foo",
                               ['id'],
                               keep_fragments=True))
Example #40
 def remove_url_parameter(url):
     return url_query_cleaner(url)
Example #41
 def test_url_query_cleaner(self):
     self.assertEqual('product.html?id=200',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id']))
     self.assertEqual('product.html?id=200',
             url_query_cleaner("product.html?&id=200&&foo=bar&name=wired", ['id']))
     self.assertEqual('product.html',
             url_query_cleaner("product.html?foo=bar&name=wired", ['id']))
     self.assertEqual('product.html?id=200&name=wired',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name']))
     self.assertEqual('product.html?id',
             url_query_cleaner("product.html?id&other=3&novalue=", ['id']))
     # default is to remove duplicate keys
     self.assertEqual('product.html?d=1',
             url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d']))
     # unique=False disables duplicate keys filtering
     self.assertEqual('product.html?d=1&d=2&d=3',
             url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False))
     self.assertEqual('product.html?id=200&foo=bar',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired#id20", ['id', 'foo']))
     self.assertEqual('product.html?foo=bar&name=wired',
             url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True))
     self.assertEqual('product.html?name=wired',
             url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True))
     self.assertEqual('product.html?foo=bar&name=wired',
             url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'footo'], remove=True))
     self.assertEqual('product.html?foo=bar',
             url_query_cleaner("product.html?foo=bar&name=wired", 'foo'))
     self.assertEqual('product.html?foobar=wired',
             url_query_cleaner("product.html?foo=bar&foobar=wired", 'foobar'))