Example No. 1
class EcarteleraSpider(CrawlSpider):
    name = 'cartelera'
    item_count = 0
    allowed_domains = ['www.ecartelera.com']
    start_urls = ['https://www.ecartelera.com/listas/mejores-peliculas/']
    
    rules = (
        # Pagination: follow the "next" link ('process' and 'appendDummy'
        # are helper methods not shown in this excerpt)
        Rule(LinkExtractor(restrict_xpaths='//div[@class="pagination"]/a[last()]'),
             callback='process', process_links='appendDummy', follow=True),
        # One rule per film detail page
        Rule(LinkExtractor(restrict_xpaths='//*[@id="listaglobal"]//a'),
             callback='parse_item', follow=True),
    )
    
    def parse_item(self, response):
        ecartelera_item = CarteleraItem()
        # film info
        ecartelera_item['titulo'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[2]/text())').extract()
        if not ecartelera_item['titulo']:
            ecartelera_item['titulo'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[3]/text())').extract()
            
        ecartelera_item['tituloOriginal'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[2]/text())').extract()
        ecartelera_item['anyo'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[1]/span/text())').extract()
        ecartelera_item['pais'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[3]/text())').extract()
        ecartelera_item['duraccion'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[4]/span)').extract()
        ecartelera_item['presupuesto'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[5]/text())').extract()
        ecartelera_item['genero'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[6]/span)').extract()
        ecartelera_item['estudio'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[7]/span)').extract()
        # ecartelera_item['distribuidora'] = response.xpath('normalize-space(
        ecartelera_item['ranking'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[3]/div/div/p[2]/strong[1])').extract()
        # ecartelera_item['listas'] = response.xpath('normalize-space(

        yield ecartelera_item
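
These listing snippets omit their imports and item definitions. As a minimal sketch, Example No. 1 would need roughly the following scaffolding (hypothetical; the real project's items module may differ, with field names taken from parse_item above):

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class CarteleraItem(scrapy.Item):
    # one Field per key assigned in parse_item
    titulo = scrapy.Field()
    tituloOriginal = scrapy.Field()
    anyo = scrapy.Field()
    pais = scrapy.Field()
    duraccion = scrapy.Field()
    presupuesto = scrapy.Field()
    genero = scrapy.Field()
    estudio = scrapy.Field()
    ranking = scrapy.Field()
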
Example No. 2
class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domains = ['mercadolibre.com.co']
    start_urls = ['https://listado.mercadolibre.com.co/celulares-xiaomi#D[A:celulares%20xiaomi]']

    rules = (
        # Pagination
        Rule(LinkExtractor(restrict_xpaths='//li[@class="andes-pagination__arrow-title"]/a')),
        # One rule per product
        Rule(LinkExtractor(restrict_xpaths='//h2[contains(@class,"main-title")]/a'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        ml_item = MercadoItem()
        # product info
        ml_item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary"]/text())').extract_first()

        ml_item['precio'] = response.xpath('normalize-space(//span[@class="price-tag-fraction"]/text())').extract()

        ml_item['envio'] = response.xpath('normalize-space(//p[contains(@class, "shipping-method-title shipping-text")]/text())').extract()

        ml_item['vendido'] = response.xpath('normalize-space(//div[@class="item-conditions"]/text())').extract()

        ml_item['opiniones'] = response.xpath('normalize-space(//span[@class="average-legend"]/text())').extract()
        # product images
        ml_item['image_urls'] = response.xpath('//figure[contains(@class, "gallery-image-container")]/a/img/@src').extract()
        ml_item['image_name'] = response.xpath('normalize-space(//h1[@class="item-title__primary"]/text())').extract_first()

        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield ml_item
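
Note the mix of .extract() and .extract_first() above: the former always returns a list, the latter a single string or None. A standalone illustration using parsel, the selector library underneath Scrapy:

from parsel import Selector

sel = Selector(text='<span class="price-tag-fraction">999</span>')
sel.css('.price-tag-fraction::text').extract()        # ['999']  -- always a list
sel.css('.price-tag-fraction::text').extract_first()  # '999'
sel.css('.missing::text').extract_first()             # None, not an IndexError
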
Example No. 3
class CumbriaSpider(CrawlSpider):
    name = 'Cumbria'
    allowed_domains = ['www.cumbria.ac.uk']
    start_urls = [
        'https://www.cumbria.ac.uk/study/courses/course-search/?level=ug-full-time-degree&level=ug-sandwich-placement&page=1'
    ]

    rules = (
        Rule(LinkExtractor(allow=r'page=\d*'), follow=True),
        Rule(LinkExtractor(
            restrict_xpaths='//div[@class="articles-wrapper"]/article/a'),
             follow=False,
             callback='parse_item'),
    )

    def parse_item(self, response):
        print('--------------------', response.url)
        titles = response.xpath('//h1//text()').extract()
        titles = ''.join(titles)
        degree_type = re.findall(r'[A-Za-z]*\s\([a-zA-Z]{0,6}\)', titles)
        degree_type = ''.join(degree_type)
        programme = titles.replace(degree_type, '').strip()

        ucas_code = response.xpath(
            '//div[@class="ucas-code"]//text()').extract()
        ucas_code = ''.join(ucas_code).replace('Course code', '').strip()
        # print(ucas_code)

        modules = response.xpath(
            '//div[@id="course-outline"]//text()').extract()
        modules = ''.join(modules)

        # the source snippet stops before yielding; a plausible completion
        # returns the collected fields as a plain dict
        yield {
            'programme': programme,
            'degree_type': degree_type,
            'ucas_code': ucas_code,
            'modules': modules,
        }
Example No. 4
class Ecspider(CrawlSpider):
    name = 'zgcspider'
    custom_settings = {
        'ITEM_PIPELINES': {
            'zgc.pipelines.RedisPipeline': 300,
        }
    }

    start_urls = [
        'http://mobile.zol.com.cn/', 'http://bbs.zol.com.cn/sjbbs/p1.html#c'
    ]
    page_link_a = LinkExtractor(allow=r'/detail_\d+/')
    link_b = LinkExtractor(allow=r'/more/\d+_\d+\.shtml')
    page_link_c = LinkExtractor(allow=r'/sjbbs/p\d+\.html#c')
    page_link_b = LinkExtractor(allow=r'/more/\d+_\d+_\d+\.shtml')

    rules = (
        Rule(page_link_a, callback="parse_item", follow=True),
        Rule(link_b),
        Rule(page_link_b, callback="parse_item", follow=True),
        Rule(page_link_c, callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        item = RedisItem()
        item['url'] = response.url
        yield item
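
Example No. 4 routes items through a spider-specific pipeline via custom_settings, but the pipeline itself is not shown. A minimal sketch of what zgc.pipelines.RedisPipeline might look like (hypothetical, using the redis-py client; the real class may differ):

import redis


class RedisPipeline(object):
    """Push each scraped URL into a Redis set."""

    def open_spider(self, spider):
        # connection details are illustrative
        self.db = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        self.db.sadd('zgc:urls', item['url'])  # a set de-duplicates for free
        return item
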
Example No. 5
class MeishijieSpider(CrawlSpider):
    name = 'meishijie'
    allowed_domains = ['meishij.net']
    start_urls = ['http://www.meishij.net']

    num = 0
    next_page = 0

    rules = (
        # category index page ('test' is a callback not shown in this excerpt)
        Rule(LinkExtractor(allow=r'http://www.meishij.net/china-food/xiaochi/$'),
             callback='test',
             follow=True),
        Rule(LinkExtractor(allow=r'http://www.meishij.net/china-food/xiaochi/\?&page=\d+$'),
             callback='next',
             follow=True),
        Rule(LinkExtractor(allow=r'http://www.meishij.net/zuofa/\w+\.html'),
             callback='save'),
    )

    def next(self, response):
        self.next_page += 1
        print("next++++++++++++++++++++++++++++++++++++++++++", self.next_page)

    def save(self, response):
        self.num += 1
        print(
            response.xpath(".//*[@id='tongji_title']/text()").extract(),
            self.num)
Example No. 6
class Furgoneta1Spider(CrawlSpider):
    name = "furgoneta1"
    item_count = 1
    MAX_ITEMS = 2500

    allowed_domains = ['mercadolibre.com.ar']
    
    start_urls = [
        'https://autos.mercadolibre.com.ar/_VEHICLE*BODY*TYPE_452750#VEHICLE_BODY_TYPE'
    ]
    
    rules = (
        # "Next" button
        Rule(LinkExtractor(restrict_xpaths="//li[contains(@class, 'andes-pagination__button andes-pagination__button--next')]/a")),
        # Enter each item
        Rule(LinkExtractor(restrict_xpaths="//div[contains(@class, 'rowItem item item--grid item--has-row-logo new')]/a"),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        item = VehiculosItem()
        
        item['id'] = "crawler1_" + str(self.item_count)
        item['categoria'] = "furgoneta"
        item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary"]/text())').extract_first()
        item['imagen_urls'] = response.xpath('//figure[contains(@class, "gallery-image-container")]/a/img/@src').extract()
        
        if self.item_count > self.MAX_ITEMS:
            raise CloseSpider("Scraping terminado con "  + str(self.item_count - 1) + " vehiculos analizados.")
        
        self.item_count += 1
        
        yield item
Example No. 7
class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domains = ['mercadolibre.com.mx']
    start_urls = [
        'https://listado.mercadolibre.com.mx/consolas-videojuegos#D[A:consolas-videojuegos,B:5]'
    ]

    rules = (
        # Pagination
        Rule(
            LinkExtractor(
                restrict_xpaths='//li[@class="pagination__next"]/a')),
        # One rule per product
        Rule(LinkExtractor(
            restrict_xpaths='//h2[contains(@class,"item__title")]/a'),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        product = MercadoItem()

        # get product info
        product['titulo'] = response.xpath(
            'normalize-space(//h1[@class="item-title__primary"]/text())'
        ).extract_first()
        # cap on the number of products
        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield product
Example No. 8
class ComputadorasSpider(CrawlSpider):
    name = 'computadoras'
    item_count = 0
    allowed_domains = ['mercadolibre.com.mx']
    start_urls = ['https://listado.mercadolibre.com.mx/computadoras#D[A:computadoras,L:1]']

    rules = (
        # Pagination
        Rule(LinkExtractor(restrict_xpaths='//li[@class="pagination__next"]/a')),
        # One rule per product
        Rule(LinkExtractor(restrict_xpaths='//h2[contains(@class,"item__title")]/a'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        computadora_item = ProyectoItem()
        # product info
        computadora_item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary"]/text())').extract_first()
        computadora_item['modelo'] = response.xpath('normalize-space(//*[@id="root-app"]/div[2]/div[1]/div[1]/section[3]/div/section/ul/li[3]/span)').extract()
        computadora_item['marca'] = response.xpath('normalize-space(//*[@id="root-app"]/div[2]/div[1]/div[1]/section[3]/div/section/ul/li[1]/span)').extract()
        computadora_item['precio'] = response.xpath('normalize-space(//span[@class="price-tag-fraction"]/text())').extract()
        computadora_item['condicion'] = response.xpath('normalize-space(//div[@class="item-conditions"]/text())').extract()
        computadora_item['opiniones'] = response.xpath('normalize-space(//span[@class="review-summary-average"]/text())').extract()
        # store / seller info
        computadora_item['tipo_vendedor'] = response.xpath('normalize-space(//p[contains(@class, "power-seller")]/text())').extract()
        computadora_item['ventas_vendedor'] = response.xpath('normalize-space(//dd[@class="reputation-relevant"]/strong/text())').extract()

        self.item_count += 1
        if self.item_count > 40:
            raise CloseSpider('item_exceeded')
        yield computadora_item
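
Several of the MercadoLibre examples cap output by counting items and raising CloseSpider by hand. Scrapy's built-in CloseSpider extension does the same without a counter; a sketch of the class attribute that would replace the manual check above (in-flight requests may still complete):

custom_settings = {
    # close the spider once 40 items have been scraped
    'CLOSESPIDER_ITEMCOUNT': 40,
}
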
Example No. 9
class MMspider(CrawlSpider):
    """docstring for MMspider"""
    name = 'mzitu'
    allowed_domains = ['www.mzitu.com']
    start_urls = ['http://www.mzitu.com/']
    rules = (
        Rule(LinkExtractor(allow=(r'/xinggan/page/\d+')), follow=True),
        Rule(LinkExtractor(allow=(r'/\d{1,6}', ), deny=(r'/\d{1,6}/\d{1,6}')),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        header = {
            "User-agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        item = MzituItem()
        item["name"] = response.css(".main-title::text").extract()
        item["url"] = response.url
        item['image_urls'] = response.css(
            ".main-image img::attr(src)").extract()
        time.sleep(random.randint(3, 6))  # blocks the whole reactor; see the note below
        # the dupefilter drops this repeat request unless dont_filter=True is passed
        yield Request(response.url, headers=header)
        yield item
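
time.sleep() inside a callback blocks Scrapy's single-threaded reactor and stalls every in-flight request. The non-blocking way to get the same randomized pacing is through settings; a sketch:

custom_settings = {
    'DOWNLOAD_DELAY': 4.5,
    # RANDOMIZE_DOWNLOAD_DELAY is on by default: each wait is drawn
    # uniformly from 0.5x to 1.5x DOWNLOAD_DELAY, i.e. roughly 2.25-6.75s here
    'RANDOMIZE_DOWNLOAD_DELAY': True,
}
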
Example No. 10
class Synsam(CrawlSpider):
    name = 'specsavers_no-synsam'
    allowed_domains = ['synsam.no']
    start_urls = ['https://www.synsam.no/kontaktlinser']

    products = LinkExtractor(restrict_css='.product-list-products')
    pagination = LinkExtractor(
        restrict_css='.paging-navigation',
        process_value=lambda x: 'https://www.synsam.no/ArticleFilter/CL/?'
        'sort=price&sortOrder=asc&from=' + re.search('fr=(.*)&?', x).group(1))
    rules = (Rule(products, callback='parse_product'), Rule(pagination))

    def parse_product(self, response):
        if response.xpath('//h5[contains(., "under varemerket")]'):
            return
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath(
            '//input[@id="articleId"]/@value').extract_first(
            ) or response.xpath('//input[@id="skuId"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        breadcrumbs = response.css('.breadcrumbs a::text').extract()[1:]
        loader.add_value('name', breadcrumbs.pop())
        loader.add_value('category', breadcrumbs[-3:])
        loader.add_xpath('price', '//h3[@itemprop="price"]/@content')
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
        loader.add_css('brand', '.product-hero-brand img::attr(alt)')
        if loader.get_output_value('price') < 1000:
            loader.add_value('shipping_cost', 49)
        yield loader.load_item()
Example No. 11
class YourLens(CrawlSpider):
    name = 'specsavers_nl-yourlens'
    allowed_domains = ['yourlenses.nl']
    start_urls = ['https://www.yourlenses.nl/lenses']

    products = LinkExtractor(restrict_css='.product-list-item')
    pages = LinkExtractor(restrict_css='.prodList-pagination :not(.disabled)')

    rules = (Rule(pages), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath(
            '//input[@id="prodid"]/@value').extract_first()
        if not identifier:
            self.logger.warning('No identifier for %s' % response.url)
            return
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_css('name', 'div.infotitle h1::text')
        loader.add_css('price', '.inline.price::text')
        loader.add_value('sku', identifier)
        image_url = response.css('.photo::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        brand = response.xpath(
            '//meta[@itemprop="brand"]/@content').extract_first()
        if not brand:
            try:
                brand = response.xpath('//script/text()').re(
                    '"manufacturer":"(.*?)"')[0].decode('unicode-escape')
            except IndexError:
                pass
        loader.add_value('brand', brand)
        yield loader.load_item()
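
The .decode('unicode-escape') call on the regex result marks this as Python 2 code; on Python 3, str has no .decode(). An equivalent fallback there could go through codecs (a sketch, reusing the XPath above):

import codecs

raw = response.xpath('//script/text()').re('"manufacturer":"(.*?)"')
if raw:
    # codecs.decode accepts str and interprets \uXXXX escapes
    brand = codecs.decode(raw[0], 'unicode_escape')
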
Example No. 12
class Aprendiendo(CrawlSpider):
    name = "aprendiendo"
    item_count = 0
    allowed_domains = ['www.degustam.com']
    start_urls = ("https://www.degustam.com/366-especial-navidad.html", )

    rules = (
        Rule(
            LinkExtractor(allow=(),
                          restrict_xpaths=('//div[@class="next"]/a'))),
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//h5/a')),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        ml_item = AprendiendoItem()

        ml_item['titulo'] = response.xpath(
            '//h1[@class="col-xs-12 page-title product-name"]/text()').extract(
            )
        ml_item['descripcion'] = response.xpath('//p/text()').extract()
        ml_item['precio'] = response.xpath(
            '//span[@itemprop="price"]/text()').extract()
        ml_item['image_urls'] = response.xpath(
            '//figure[contains(@class, "col-sm-4 col-xs-12 product-img2")]/a/img/@src'
        ).extract()
        ml_item['image_name'] = response.xpath(
            '//h1[@class="col-xs-12 page-title product-name"]/text()'
        ).extract_first()

        self.item_count += 1
        if self.item_count > 5:
            raise CloseSpider('item_exceeded')
        yield ml_item
Example No. 13
class MercadoSpider(CrawlSpider):
    name = "mercado"
    item_count = 0
    allowed_domains = ['mercadolibre.com.ve']
    start_urls = ['https://listado.mercadolibre.com.ve/impresoras']

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=(
                    '//*[@id="results-section"]/div[2]/ul/li[12]/a'))),
        Rule(LinkExtractor(
            allow=(),
            restrict_xpaths=(
                '//li[@class="results-item article grid item-info-height-117"]'
            )),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        yield {
            'titulo':
            response.xpath(
                'normalize-space(//*[@id="short-desc"]/div/header/h1)').
            extract(),
            'precio':
            response.xpath(
                'normalize-space(//*[@id="productInfo"]/fieldset[1]/span/span[2])'
            ).extract()
        }
        self.item_count += 1
        if self.item_count > 30:
            raise CloseSpider('item_exceeded')
Example No. 14
class NewsSpider(CrawlSpider):
    name = 'news'
    allowed_domains = ['www.nikkei.com']
    start_urls = ['http://www.nikkei.com/news/category/']
    rules = [
        Rule(LinkExtractor(allow=r'/news/category/[a-zA-Z]+/$')),
        Rule(LinkExtractor(allow=r'/article/[a-zA-Z\d_]+/$'), callback='parse_articles'),
    ]

    # def parse(self, response):
    #     for url in response.css('h4.cmn-article_title a::attr("href")').re(r'/article/[a-zA-Z\d_]+/$'):
    #         yield scrapy.Request(response.urljoin(url), self.parse_articles)

    def parse_articles(self, response):
        title = response.css('.cmnc-middle ::text').extract_first()
        body = response.css('.cmn-article_text').xpath('string()').extract_first().strip()
        publish_date = response.css('.cmnc-publish ::text').extract_first()
        category = response.css('.cmn-topic_path').xpath('string()').extract_first().strip()
        yield Page(
            url=response.url,
            key=extract_key(response.url),
            html=response.text,
            title=title,
            body=body,
            date=publish_date,
            category=category
        )
        """
Example No. 15
class ListSpider(CrawlSpider):
    # spider name
    name = "tutorial"
    # download delay
    download_delay = 1
    # allowed domains
    allowed_domains = ["news.cnblogs.com"]
    # start URLs
    start_urls = ["https://news.cnblogs.com"]
    # crawl rules
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_content'),
    )

    # parse the article content
    def parse_content(self, response):
        item = TutorialItem()
        # title
        title = response.selector.xpath(
            '//div[@id="news_title"]')[0].extract().decode('utf-8')
        item['title'] = title
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        releasedate = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['releasedate'] = releasedate
        yield item
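
SgmlLinkExtractor has long been deprecated and is removed from modern Scrapy releases. On current versions the same rules would use the lxml-based extractor:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

rules = (
    Rule(LinkExtractor(allow=(r'https://news.cnblogs.com/n/page/\d', ))),
    Rule(LinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
         callback='parse_content'),
)
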
Example No. 16
class TabelogSpider(CrawlSpider):
    name = 'tabelog'
    allowed_domains = ["tabelog.com"]
    start_urls = [
        'https://tabelog.com/tokyo/rstLst/lunch/?LstCosT=2&RdoCosTp=1'
    ]
    rules = [
        Rule(LinkExtractor(allow=r'/\w+/rstLst/lunch/\d/')),
        Rule(LinkExtractor(allow=r'/\w+/A\d+/A\d+/\d+/$'),
             callback='parse_restaurant'),
    ]

    def parse_restaurant(self, response):
        latitude, longitude = response.css(
            'img.js-map-lazyload::attr("data-original")').re(
                r'markers=.*?%7C([\d.]+),([\d.]+)')

        item = Restaurant(
            name=response.css('.display-name').xpath(
                'string()').extract_first().strip(),
            address=response.css('[class="rstinfo-table__address"]').xpath(
                'string()').extract_first(),
            latitude=latitude,
            longitude=longitude,
            station=response.css('[class="linktree__parent-target-text"]'
                                 ).xpath('string()').extract_first(),
            score=response.css('[class="rdheader-rating__score-val-dtl"]').
            xpath('string()').extract_first(),
        )

        return item
Example No. 17
class BandQ(CrawlSpider):
    name = 'e-bedding-bandq'
    allowed_domains = ['diy.com']
    start_urls = (
        'http://www.diy.com/rooms/bedroom/bedding/DIY822487.cat',
        'http://www.diy.com/rooms/bedroom/beds-mattresses/DIY822423.cat')

    categories = LinkExtractor(restrict_css='#content .menu')
    pages = LinkExtractor(restrict_css='.paginator')
    products = LinkExtractor(restrict_css='#product-listing h3')

    rules = (Rule(categories), Rule(pages),
             Rule(products, callback='parse_product'))

    def parse_product(self, response):
        loader = ProductLoader(Product(), response=response)
        identifier = re.search(r'(\d+)_BQ', response.url).group(1)
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_css('name', '.product-summary h1.product-title::text')
        loader.add_css('price', '.product-price::attr(content)')
        loader.add_css('sku', 'dl.product-code dd::text')
        loader.add_value('category', 'Bedroom')
        category = response.css('.breadcrumb').xpath(
            './/li/a/text()').extract()[-1]
        loader.add_value('category', category)
        image_url = response.css('.main-img img::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath('brand',
                         '//th[text()="Brand"]/following-sibling::td/text()')
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', 5)
        yield loader.load_item()
Example No. 18
class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domains = ['mercadolibre.com.pe']
    start_urls = [
        'https://listado.mercadolibre.com.pe/impresoras#D[A:impresoras]'
    ]

    rules = (
        # One rule per item
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('//li[@class="pagination__next"]/a'))),
        Rule(LinkExtractor(
            allow=(),
            restrict_xpaths=(
                '//*[@class="item__title list-view-item-title" ]')),
             callback='parse_item',
             follow=False)
    )

    def parse_item(self, response):
        ml_item = MercadoItem()
        # product info
        ml_item['titulo'] = response.xpath(
            'normalize-space(//*[@class="item-title__primary"]/text())'
        ).extract_first()
        ml_item['folio'] = response.xpath(
            'normalize-space(//*[@class="item-info__id-number"]/text())'
        ).extract()
        ml_item['precio'] = response.xpath(
            'normalize-space(//span[@class="price-tag-fraction"]/text())'
        ).extract()
        ml_item['envio'] = response.xpath(
            'normalize-space(//*[@class="shipping-method-title"]/text())'
        ).extract()
        ml_item['ubicacion'] = response.xpath(
            'normalize-space(//*[@class="custom-address"]//text())').extract()
        ml_item['ventas_producto'] = response.xpath(
            'normalize-space(//*[@class="item-conditions"]/text())').extract()

        # store / seller info
        ml_item['vendedor_url'] = response.xpath(
            '//*[@class="reputation-view-more card-block-link"]/@href'
        ).extract()
        ml_item['ventas_vendedor'] = response.xpath(
            'normalize-space(//*[@class="reputation-relevant"][2]/strong)'
        ).extract()
        ml_item['reputacion'] = response.xpath(
            'normalize-space(//*[@class="reputation-relevant"][1]/strong)'
        ).extract()

        self.item_count += 1
        if self.item_count > 5:
            raise CloseSpider('item_exceeded')
        yield ml_item
Example No. 19
class RutlandcyclingSpider(CrawlSpider):
    name = 'zyro-rutlandcycling.com'
    allowed_domains = ['rutlandcycling.com']
    start_urls = ('http://www.rutlandcycling.com',)
    
    rules = (
        Rule(LinkExtractor(restrict_css='.ctrNavigation, #lnkNextTop')),
        Rule(LinkExtractor(restrict_xpaths='//div[@itemtype="http://schema.org/Product"]'), callback='parse_product')
        )

    def _parse(self, response):
        for url in response.css('.ctrNavigation a::attr(href)').extract():
            yield Request(response.urljoin(url), callback=self.parse)

        for url in response.xpath('//div[@itemtype="http://schema.org/Product"]//a/@href').extract():
            yield Request(response.urljoin(url), callback=self.parse_product)

    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = response.xpath('//h1/text()').extract()[0]
        identifier = response.xpath('//*[@id="currentProduct"]/@value').extract()[0]
        sku = response.xpath('//p[contains(., "Code")]/span[@class="seasonCode"]/text()').extract()
        sku = sku[0] if sku else ''
        brand = response.xpath('//p[contains(., "Brand")]/span[@class="seasonCode"]/text()').extract()
        brand = brand[0] if brand else ''
        image_url = response.css('.mainImages ::attr(data-image)').extract()
        category = response.xpath('//div[@class="breadcrumbs"]//a/text()').extract()[1:-1]

        products = response.xpath('//div[@class="clAttributeGridContainer"]/div')

        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            p_name = product.select('div[@id="attName"]/div/text()').extract()[0]
            p_name = name + ' ' + p_name.replace('On Sale - ', '')
            p_identifier = product.select('div[@id="attCode"]/text()').extract()[0]
            price = product.select('div[@id="attPrice"]/span[@id]/text()').extract()[0]
            price = extract_price(price)
            out_of_stock = product.select('div[@id="attStockMessage"]/span[@class="OutofStockCSS"]').extract()

            product_loader.add_value('identifier', identifier + '_' + p_identifier)
            product_loader.add_value('name', p_name)
            product_loader.add_value('sku', sku)
            if image_url:
                product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('price', price)
            if price < 20:
                product_loader.add_value('shipping_cost', 3.49)
            if out_of_stock:
                product_loader.add_value('stock', 0)
            product_loader.add_value('category', category)
            product_loader.add_value('brand', brand)
            product_loader.add_value('url', response.url)
            product = product_loader.load_item()
            yield product
Example No. 20
class BMStores(Spider):
    name = 'toymonitor-bmstores'
    allowed_domains = ['bmstores.co.uk']
    start_urls = ['http://www.bmstores.co.uk/products/toys-and-games']

    categories = LinkExtractor(restrict_css='ul.aside-list')
    products = LinkExtractor(restrict_css='a.product')

    rules = (Rule(categories, callback='parse_pages',
                  follow=True), Rule(products, callback='parse_product'))

    def parse_pages(self, response):
        category_id = response.xpath('//script/text()').re(
            "categoryID: '(.+)'")[0]
        for page in response.css(
                'div.pagination ::attr(data-pageto)').extract():
            url = 'http://www.bmstores.co.uk/hpcProduct/productbyfilter/ajaxmode/1?categoryID=%s&sort=datehigh&perPage=36&pageNum=%s' % (
                category_id, page)
            yield Request(url, self.parse_page)

    def parse_page(self, response):
        data = json.loads(response.body)
        if not data['success']:
            self.logger.warning('Failed pagination %s' % response.url)
        selector = Selector(text=data['paginationLink'])
        for page in selector.css(
                'div.pagination ::attr(data-pageto)').extract():
            url = add_or_replace_parameter(response.url, 'pageNum', page)
            yield Request(url, self.parse_page)
        selector = Selector(text=data['pageHTML'])
        for url in selector.css('a.product::attr(href)').extract():
            yield Request(response.urljoin(url), self.parse_product)

    def parse_product(self, response):
        if 'login.cfm' in response.url:
            return
        loader = ProductLoader(Product(), response=response)
        identifier = response.url.split('/')[-1]
        identifier = hashlib.md5(identifier).hexdigest()
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_css('name', 'h1.content-title::text')
        loader.add_xpath('price', '//script/text()', re='price": "(.+)"')
        loader.add_xpath('sku', '//script/text()', re='sku": "(.+)"')
        category = response.xpath(
            '//ul[@id="breadcrumbs"][1]//a/text()').extract()[1:-1]
        loader.add_value('category', category)
        image_url = response.css(
            'div.product-detail-feature-img img::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath('brand', '//meta[@property="og:brand"]/@content')
        stock = response.xpath('//script/text()').re('availability": "(.+)"')
        if stock and stock[0] != 'In stock':
            loader.add_value('stock', 0)
        yield loader.load_item()
Example No. 21
class AqiSpider(CrawlSpider):
    name = 'aqi_crawl'
    # target domain
    allowed_domains = ['aqistudy.cn']

    # 1. request the index page
    start_urls = ['https://www.aqistudy.cn/historydata/']

    # with a callback, follow defaults to False;
    # without a callback, follow defaults to True

    rules = (
        # extract every city link automatically and let Scrapy request it
        Rule(LinkExtractor(allow=r'monthdata\.php')),

        # extract every month link automatically; parse the data by hand
        Rule(LinkExtractor(allow=r'daydata\.php'), callback="parse_day", follow=False),
    )

    # 4. parse the target data: one record per day
    def parse_day(self, response):

        item = AqiItem()

        # parse the title, then slice the city name out of it
        title = response.xpath('//*[@id="title"]/text()').extract_first()
        item['city_name'] = title[8:-11]

        # 1. grab every table row
        tr_list = response.xpath('//tr')

        # 2. drop the header row
        tr_list.pop(0)

        for tr in tr_list:
            # date
            item['date'] = tr.xpath('./td[1]/text()').extract_first()
            # AQI
            item['aqi'] = tr.xpath('./td[2]/text()').extract_first()
            # quality level
            item['level'] = tr.xpath('./td[3]//text()').extract_first()
            # PM2.5
            item['pm2_5'] = tr.xpath('./td[4]/text()').extract_first()
            # PM10
            item['pm10'] = tr.xpath('./td[5]/text()').extract_first()
            # SO2
            item['so_2'] = tr.xpath('./td[6]/text()').extract_first()
            # CO
            item['co'] = tr.xpath('./td[7]/text()').extract_first()
            # NO2
            item['no_2'] = tr.xpath('./td[8]/text()').extract_first()
            # O3
            item['o3'] = tr.xpath('./td[9]/text()').extract_first()

            # hand the item to the engine, then on to the pipeline
            yield item
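
One catch in the loop above: it mutates a single AqiItem for every row, so any pipeline that buffers item references (batching, deduplication, etc.) would see only the last row's values. A sketch of a safer variant that builds a fresh item per row and collapses the nine lookups (note it uses //text() for every column, whereas the original does so only for td[3]):

FIELDS = ['date', 'aqi', 'level', 'pm2_5', 'pm10', 'so_2', 'co', 'no_2', 'o3']

for tr in tr_list:
    row = AqiItem()
    row['city_name'] = title[8:-11]
    for i, field in enumerate(FIELDS, start=1):
        row[field] = tr.xpath('./td[%d]//text()' % i).extract_first()
    yield row
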
Example No. 22
class Coverbrands(CrawlSpider):
    name = "blivakker-coverbrands"
    allowed_domains = ['coverbrands.no']
    start_urls = ['http://www.coverbrands.no/']

    rules = (Rule(
        LinkExtractor(restrict_xpaths='//ul[@id="nav"]',
                      restrict_css='.pages')),
             Rule(LinkExtractor(restrict_css='.products-grid',
                                process_value=url_query_cleaner),
                  callback='parse_product'))

    def parse_product(self, response):
        loader = ProductLoader(item=Product(), response=response)
        css = '.nosto_product .%s ::text'
        loader.add_css('identifier', css % 'product_id')
        loader.add_css('sku', css % 'product_id')
        for field in ('url', 'name', 'image_url', 'brand'):
            loader.add_css(field, css % field)
        list_price = response.css(css % 'list_price').extract_first()
        sales_price = response.css(css % 'price').extract_first()
        loader.add_value('price', list_price)
        if 'InStock' not in response.css(css % 'availability').extract_first():
            loader.add_value('stock', 0)
        category = response.css(css % 'category').extract_first()
        loader.add_value('category', category.split('/')[-1])
        options_data = response.xpath('//script/text()').re(
            'Product.Config.({.+})')
        if not options_data:
            item = loader.load_item()
            if sales_price != list_price:
                item['metadata'] = {'SalesPrice': Decimal(sales_price)}
            yield item
            return
        options_data = json.loads(options_data[0])
        if len(options_data['attributes']) > 1:
            self.log('More than one options attributes found on %s' %
                     response.url)
            return
        price = loader.get_output_value('price')
        name = loader.get_output_value('name')
        sales_price = Decimal(sales_price)
        for option in options_data['attributes'].values()[0]['options']:
            new_price = sales_price + Decimal(option['price'])
            loader.replace_value('price', price + Decimal(option['oldPrice']))
            loader.replace_value('name', name + ' ' + option['label'])
            loader.replace_value('identifier', option['products'][0])
            loader.replace_value('sku', option['products'][0])
            loader.replace_xpath(
                'image_url', '//li[@id="simple-product-image-%s"]/a/@href' %
                option['products'][0])
            item = loader.load_item()
            if price + Decimal(option['oldPrice']) != new_price:
                item['metadata'] = {'SalesPrice': new_price}
            yield item
Example No. 23
class SinaSpecialSpider(CrawlSpider):
    name = "sina_special"
    #allowed_domains = ["www.51job.com"]
    start_urls = ('http://match.2016.sina.com.cn/medals/', )

    rules = (
        Rule(LinkExtractor(
            allow=('2016.sina.com.cn/china/[0-9\-]*/doc-if[a-z0-9]*.shtml', )),
             callback='parse_one_news',
             follow=True),
        Rule(LinkExtractor(
            allow=('2016.sina.com.cn/brazil/[0-9\-]*/doc-if[a-z0-9]*.shtml',
                   '2016.sina.com.cn/side/[0-9\-]*/doc-if[a-z0-9]*.shtml')),
             callback='parse_one_news',
             follow=True),
        Rule(LinkExtractor(allow=('2016.sina.com.cn', ),
                           deny=(
                               'php$',
                               'php?',
                               'video.sina.com.cn',
                           )),
             follow=True),
    )

    # the rules' parse_one_news callback is presumably the one shown in
    # Example No. 27; the medals start page is handled here via
    # parse_start_url, so CrawlSpider's own parse() keeps driving the rules
    def parse_start_url(self, response):
        def do_item(item):
            if item and isinstance(item, list):
                return item[0]
            return item

        try:
            rows = response.css("table[class='tb_02 tb_04'] tr[class='sub']")

            for row in rows:
                item = SpecItem()
                item['url'] = response.url
                item['kind'] = 1
                item['rank'] = row.css(
                    "td[class='w01'] ::text").extract()[0].strip()
                item['country'] = row.css(
                    "td[class='w02'] a::text").extract()[0].strip()
                item['gold'] = row.css(
                    "td[class='w03'] a::text").extract()[0].strip()
                item['silver'] = row.css(
                    "td[class='w04'] a::text").extract()[0].strip()
                item['bronze'] = row.css(
                    "td[class='w05'] a::text").extract()[0].strip()
                item['total'] = row.css(
                    "td[class='w06'] a::text").extract()[0].strip()
                yield item

        except Exception as e:
            self.logger.error("parse url:%s err:%s", response.url, e)
            # this callback is a generator (it yields items), so it cannot
            # return a value; just stop here
            return
Example No. 24
class FeneticWellbeing(CrawlSpider):
    name = 'betterlife_healthcare-feneticwellbeing'
    allowed_domains = ['feneticwellbeing.com']
    start_urls = ['http://www.feneticwellbeing.com/']

    categories = LinkExtractor(allow='/product-category/')
    products = LinkExtractor(allow='/shop/')

    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath(
            '//input[@name="product_id"]/@value').extract_first(
            ) or response.xpath(
                '//input[@name="add-to-cart"]/@value').extract_first()
        if not identifier:
            loader.add_value('stock', 0)
            identifier = response.xpath(
                '//div[@itemtype="http://schema.org/Product"]/@id').re_first(
                    'product-(\d+)')
        loader.add_value('identifier', identifier)
        loader.add_css('sku', 'span.sku::text')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_css('price', '.product-price-exvat span.amount::text')
        loader.add_css('price', '.product-price span.amount::text')
        category = response.xpath(
            '//span[@class="posted_in"][contains(., "Categories:")]/a/text()'
        ).extract_first()
        loader.add_value('category', category)
        loader.add_css('image_url',
                       'div.single-product-main-image a::attr(href)')
        brand = response.xpath(
            '//span[@class="posted_in"][contains(., "Brands:")]/a/text()'
        ).extract_first()
        loader.add_value('brand', brand)
        item = loader.load_item()

        variations = response.xpath(
            '//@data-product_variations').extract_first()
        if not variations:
            yield item
            return
        variations = json.loads(variations)
        for variant in variations:
            loader = ProductLoader(Product(), response=response)
            loader.add_value(None, item)
            loader.replace_value('identifier', variant['variation_id'])
            loader.replace_value('sku', variant['sku'])
            loader.replace_value('price', variant['display_price'])
            if variant['image_link']:
                loader.replace_value('image_url', variant['image_link'])
            loader.add_value('name', variant['attributes'].values())
            yield loader.load_item()
Example No. 25
class Ocado(CrawlSpider):
    name = 'e-bedding-ocado'
    allowed_domains = ['ocado.com']
    start_urls = [
        'https://www.ocado.com/webshop/getCategories.do?tags=|30931|126580'
    ]

    categories = LinkExtractor(restrict_css='#navigationSidebar .superNav')
    products = LinkExtractor(restrict_css='.productTitle',
                             allow='/product/',
                             process_value=url_query_cleaner)

    rules = (Rule(categories, callback='parse_category',
                  follow=True), Rule(products, callback='parse_product'))

    def parse_category(self, response):
        count = response.css('#productCount em::text').re('\d+')[0]
        for idx in xrange(int(count)):
            url = add_or_replace_parameter(response.url, 'index', idx)
            yield Request(url)

    def parse_product(self, response):
        options = response.css('.pg_select')
        if options:
            selected_option = options.xpath('option[@selected]')
            if not selected_option:
                for url in options.xpath('.//@data-href').extract():
                    yield Request(response.urljoin(url_query_cleaner(url)),
                                  self.parse_product)
                return

        loader = ProductLoader(Product(), response=response)
        sku = response.xpath(
            '//div[@id="content"]//input[@name="sku"]/@value').extract_first()
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//strong[@itemprop="name"]/text()')
        loader.add_css('price', 'div.show h5 ::text')
        loader.add_css('price', '.nowPrice ::text')
        loader.add_css('price', '.typicalPrice h5 ::text')
        category = response.xpath('//input[@name="productDetailsDTO"]/@value'
                                  ).re('"category":"(.+?)"')
        if category:
            loader.add_value('category', category[0].split('/'))
        image_url = response.css(
            'ul#galleryImages a::attr(href)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath(
            'brand',
            '//span[@itemprop="brand"]//span[@itemprop="name"]/text()')
        if response.css('div#content p.oos'):
            loader.add_value('stock', 0)
        yield loader.load_item()
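
Several examples in this listing (Coverbrands, Ocado, RebelSport) call add_or_replace_parameter and url_query_cleaner without showing where they come from; both live in w3lib, which ships as a Scrapy dependency:

from w3lib.url import add_or_replace_parameter, url_query_cleaner

add_or_replace_parameter('https://example.com/x?index=0', 'index', '5')
# -> 'https://example.com/x?index=5'
url_query_cleaner('https://example.com/x?id=1&utm_source=foo')
# -> 'https://example.com/x' (keeps only the parameters you list; none here)
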
Example No. 26
class RebelSport(CrawlSpider):
    name = 'kitbag_au-rebelsport'
    allowed_domains = ['rebelsport.com.au']
    start_urls = [
        'http://www.rebelsport.com.au/store/fangear/soccer-football/604'
    ]

    categories = LinkExtractor(
        restrict_css='.secondary-menu',
        process_value=lambda url: add_or_replace_parameter(
            url, 'pageSize', '500'))
    pages = LinkExtractor(restrict_css='.pagination')
    products = LinkExtractor(
        restrict_css='.product',
        process_value=lambda url: make_variant_url(url_query_cleaner(url)))

    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        data = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
        data = json.loads(data.replace('\\"', '"'))
        variants = data['Variants']
        for variant in variants:
            url = response.urljoin(variant['ProductPLU'])
            yield Request(make_variant_url(url), self.parse_product)

        loader = ProductLoader(item=Product(), response=response)
        identifier = response.xpath(
            '//input[@id="ProductPLU"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')
        metadata = {}
        for i in xrange(3):
            variant_name = data['Variant%dSelected' % (i + 1)]
            if variant_name and variant_name != 'N/A':
                loader.add_value('name', variant_name)
                metadata[data['Variant%dHeader' % (i + 1)]] = variant_name
                if 'size' in variant_name.lower():
                    metadata['size'] = variant_name[5:].strip()
        price = response.css('.price-value .currency::text').extract()
        loader.add_value('price', price.pop())
        category = response.css('.breadcrumb a::text').extract()
        loader.add_value('category', category[1:])
        loader.add_css('image_url', '.product-image::attr(src)')
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('shipping_cost', '7.95')
        stock = response.css('.product-stock-widget::attr(ng-init)').re(
            'AvailableOnline: (\w+)')[0]
        if stock != 'true':
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = metadata
        yield item
Example No. 27
class SinaOlySpider(CrawlSpider):
    name = "sina_oly"
    #allowed_domains = ["www.51job.com"]
    start_urls = ('http://2016.sina.com.cn/', )

    rules = (
        Rule(LinkExtractor(
            allow=('2016.sina.com.cn/china/[0-9\-]*/doc-if[a-z0-9]*.shtml', )),
             callback='parse_one_news',
             follow=True),
        Rule(LinkExtractor(
            allow=('2016.sina.com.cn/brazil/[0-9\-]*/doc-if[a-z0-9]*.shtml',
                   '2016.sina.com.cn/side/[0-9\-]*/doc-if[a-z0-9]*.shtml')),
             callback='parse_one_news',
             follow=True),
        Rule(LinkExtractor(allow=('2016.sina.com.cn', ),
                           deny=(
                               'php$',
                               'php?',
                               'video.sina.com.cn',
                           )),
             follow=True),
    )

    def parse_one_news(self, response):
        def do_item(item):
            if item and isinstance(item, list):
                return item[0]
            return item

        item = NewsItem()
        try:
            cn = response.css("div[class='cn']")
            item['url'] = response.url
            item['title'] = do_item(
                response.css(
                    "div[class='blkContainerSblk'] h1::text").extract())

            art_info = response.css("div[class='artInfo']")
            item['publish'] = do_item(
                art_info.css("span[id='pub_date']::text").extract())
            item['pic_title'] = do_item(
                response.css("span[class='img_descr'] ::text").extract())
            item['keywords'] = do_item(
                response.css("p[class='art_keywords'] a::text").extract())
            '''
            filename = response.url.split("/")[-2] + '.html'
            with open(filename,'wb') as f:
                f.write(response.body)
            '''
        except Exception as e:
            self.logger.error("parse url:%s err:%s", response.url, e)
            return []
        return item
Example No. 28
class LifeSpider(CrawlSpider):
    name = 'life'

    # limits the number of items scraped; optional, but in some cases you
    # don't want to scrape to infinity
    #item_count = 0

    # only allow information within this domain, don't let scrapy get out of it
    allowed_domains = ['lifeinformatica.com']
    # for laptops
    start_urls = [
        'https://lifeinformatica.com/categoria-producto/family-ordenadores-y-portatiles/family-portatiles-y-accesorios/family-portatiles/'
    ]
    # for smartphones
    #start_urls = ['https://lifeinformatica.com/categoria-producto/family-tablets-y-moviles/family-smartphones-y-accesorios/family-smartphones/']

    rules = (
        # go through every page with the next button
        Rule(
            LinkExtractor(allow=(),
                          restrict_xpaths=(
                              '//nav[@class="electro-advanced-pagination"]/a'))
        ),
        # go inside every product on the page
        Rule(LinkExtractor(
            allow=(), restrict_xpaths=('//div[@class="product-loop-header"]')),
             callback='parse_item',
             follow=False)
    )

    def parse_item(self, response):
        item = LifeItem()

        # product information
        item['brand'] = response.xpath(
            '//span[@itemprop="brand"]//text()').extract()
        item['name'] = response.xpath(
            '//h1[@class="product_title entry-title"]//text()').extract()
        item['price'] = response.xpath(
            '//span[@class="entero"]//text()').extract()
        item['decimals'] = response.xpath(
            '//span[@class="decimales_precio"]//text()')[0].extract()
        item['currency'] = response.xpath(
            '//span[@class="decimales_precio"]//text()')[1].extract()
        item['price_without_vat'] = response.xpath(
            '//p[@class="sinIva"]//text()').extract()
        item['availability'] = response.xpath(
            '//span[@class="no_stock"]//text()').extract()
        item['description'] = response.xpath(
            '//div[@class="electro-description clearfix"]/p/text()').extract()
        # self.item_count += 1
        # if self.item_count > 5:
        #     raise CloseSpider('item_exceeded')
        yield item
Example No. 29
class ShoucaiSpider(CrawlSpider):
    name = 'shoucai'
    start_urls = ['https://www.shoucainu8.com/Invest/llist/status/3']
    rules = (
        Rule(LinkExtractor(allow=(r'https://www.shoucainu8.com/Invest/llist/status/3/p/\d+.html'))),
        Rule(LinkExtractor(allow=(r'https://www.shoucainu8.com/invest/detail/sn/\d+')),
             callback='parse_item')
    )

    def parse_item(self, response):
        sel = Selector(response)
        wealth_title = sel.xpath('//div[@class="invest-title"]/h2/text()')\
            .extract()
        wealth_interest_rate = sel.xpath('//p[@class="rate"]/text()')\
            .extract()
        wealth_sum = sel.xpath('//p[@class="total"]/text()').extract()
        wealth_deadline = sel.xpath('//p[@class="duration"]/text()').extract()
        wealth_starting_amount_or_username = sel\
            .xpath('//table[@class="table-details"]/tr[1]/td/text()').extract()
        wealth_interest_bearing_method_or_id = sel\
            .xpath('//table[@class="table-details"]/tr[2]/td/text()').extract()
        wealth_phone_number_or_product_manual = sel\
            .xpath('//table[@class="table-details"]/tr[3]/td/text()').extract()
        wealth_excepted_return_or_type_of_loan = sel\
            .xpath('//table[@class="table-details"]/tr[4]/td/text()').extract()
        wealth_redemption_exit_or_use_of_the_loan = sel\
            .xpath('//table[@class="table-details"]/tr[5]/td/text()').extract()
        wealth_asset_type = sel\
            .xpath('//table[@class="table-details"]/tr[6]/td/text()').extract()
        wealth_market_value = sel\
            .xpath('//table[@class="table-details"]/tr[7]/td/text()').extract()
        wealth_payback = sel\
            .xpath('//table[@class="table-details"]/tr[8]/td/text()').extract()
        wealth_risk_control = sel\
            .xpath('//table[@class="table-details"]/tr[9]/td/text()').extract()


        item = ShoucainuItem()
        item['wealth_title'] = wealth_title
        item['wealth_interest_rate'] = wealth_interest_rate
        item['wealth_sum'] = wealth_sum
        item['wealth_deadline'] = wealth_deadline
        item['wealth_starting_amount_or_username'] = wealth_starting_amount_or_username
        item['wealth_interest_bearing_method_or_id'] = wealth_interest_bearing_method_or_id
        item['wealth_phone_number_or_product_manual'] = wealth_phone_number_or_product_manual
        item['wealth_excepted_return_or_type_of_loan'] = wealth_excepted_return_or_type_of_loan
        item['wealth_redemption_exit_or_use_of_the_loan'] = wealth_redemption_exit_or_use_of_the_loan
        item['wealth_asset_type'] = wealth_asset_type
        item['wealth_market_value'] = wealth_market_value
        item['wealth_payback'] = wealth_payback
        item['wealth_risk_control'] = wealth_risk_control
        yield item
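
The nine near-identical table-details lookups above invite a data-driven refactor. A sketch, assuming the same ShoucainuItem fields:

DETAIL_FIELDS = [
    'wealth_starting_amount_or_username',
    'wealth_interest_bearing_method_or_id',
    'wealth_phone_number_or_product_manual',
    'wealth_excepted_return_or_type_of_loan',
    'wealth_redemption_exit_or_use_of_the_loan',
    'wealth_asset_type',
    'wealth_market_value',
    'wealth_payback',
    'wealth_risk_control',
]
for row, field in enumerate(DETAIL_FIELDS, start=1):
    item[field] = sel.xpath(
        '//table[@class="table-details"]/tr[%d]/td/text()' % row).extract()
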
Example No. 30
class WatchO(CrawlSpider):
    name = 'bablas-watcho'
    allowed_domains = ['watcho.co.uk']
    start_urls = ('http://www.watcho.co.uk/watches.html',
                  'http://www.watcho.co.uk/Clocks.html')
    
    categories = LinkExtractor(restrict_css='div.SubCategoryListGrid',
                               restrict_xpaths='//a[@href="%s" or @href="%s"]/following-sibling::*' % start_urls)
    pages = LinkExtractor(restrict_css='div.CategoryPagination')
    products = LinkExtractor(restrict_css='div.ProductDetails')
    
    rules = (Rule(categories),
             Rule(pages),
             Rule(products, callback='parse_product'))
    
    def parse_product(self, response):
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath('//input[@name="product_id"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
        category = response.xpath('//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
        loader.add_value('category', category)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
        loader.add_xpath('brand', '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
        if not response.xpath('//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
            loader.add_value('stock', 0)
        
        sku = identifier
        name = loader.get_output_value('name')
        name_end = re.search(r'\S+$', name).group(0).strip(' ()')
        keywords = response.xpath('//meta[@name="keywords"]/@content').extract_first().split(',')
        keywords = [word.strip() for word in keywords if word]
        shortest_keyword = min(keywords, key=len) if keywords else 'none'
        from_name = re.findall(r'\S*\d+\S*', name)
        if shortest_keyword.lower() == name_end.lower():
            sku = name_end
        elif shortest_keyword.upper() == shortest_keyword:
            sku = shortest_keyword
        elif name_end.upper() == name_end:
            sku = name_end
        elif from_name:
            sku = max(from_name, key=len)
            if '(' in sku:
                sku = identifier
        loader.replace_value('sku', sku)
        yield loader.load_item()