Ejemplo n.º 1
0
 def parse_item(self, response):
     """Populate a CrawlpictureItem (name, tags, image urls, url) from a detail page."""
     loader = ItemLoader(item=CrawlpictureItem(), response=response)
     loader.add_xpath('name', '//h2/a/text()')
     loader.add_css('tags', 'div.metaRight p::text')
     # Identity() keeps the full list of image URLs instead of collapsing it.
     loader.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
 def parse_product(self, response):
     """Build a Product item from a product detail page and yield it.

     Fields: nome, url, descricaoLongaHtml, descricaoLonga, image, categorias.
     """
     p = ItemLoader(item=Product(), response=response)
     p.add_css('nome', 'h1 > span[itemprop=name]::text')
     p.add_value('url', response.url)
     p.add_css('descricaoLongaHtml', '.infoProdBox')
     p.add_css('descricaoLonga', '.infoProdBox')
     # FIX: the original class ["|'] also matched a literal '|' (inside a
     # character class '|' is not alternation); raw-string ["'] matches
     # only quote characters around the src value.
     p.add_css('image', 'ul.a-carousel-list > li > img',
               re=r'src=["\'](?P<src>[^"\']+)["\']')
     p.add_css('categorias', 'div[class=breadcrumb-box] span[itemprop=name]::text')
     yield p.load_item()
Ejemplo n.º 3
0
    def parse(self, res):
        """Load link/title/description from the page into a WeiboItem and return it."""
        l = ItemLoader(item=WeiboItem(), response=res)

        l.add_css("link", "link[href]")
        l.add_css("title", "title")
        l.add_css("desc", "a span")

        # FIX: use the print() function — the Python 2 print statement is a
        # syntax error on Python 3, and the call form works on both.
        print("pachong is over!!!")

        return l.load_item()
Ejemplo n.º 4
0
 def parsePost(self, response):
 	"""Extract a Post (url, title, date, author, content) from a blog entry page."""
 	loader = ItemLoader(item=Post(), response=response)
 	# PyQuery handles the free-form entry body; CSS loaders cover the rest.
 	doc = pyq(response.body)
 	loader.add_value('url', response.url)
 	loader.add_css('title', 'h1.entry-title::text')
 	loader.add_css('date', 'span.entry-date::text')
 	loader.add_css('author', 'span.author.vcard > a::text')
 	loader.add_value('content', doc('div.entry-content').text())
 	return loader.load_item()
Ejemplo n.º 5
0
def Loader_index(self, item_selector):
    """Load title/url/preview/date/image_urls from one index-list selector."""
    loader = ItemLoader(item={}, selector=item_selector)

    # The cover image list is reused for both the preview and the downloads.
    cover_imgs = loader.get_xpath('.//*[@class="lz_img"]/img/@src')

    loader.add_xpath('title', './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    loader.add_xpath('url', './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    loader.add_value('preview', cover_imgs)
    loader.add_css('date', '#k_list-lb-2-f::text', re=r'(\d{4}-\d{2}-\d{2})')
    loader.add_value('image_urls', cover_imgs)
    return loader.load_item()
Ejemplo n.º 6
0
 def parse_product(self, response):
     """Build a Product item from a product detail page and yield it.

     Fields: nome, url, descricaoLongaHtml, descricaoLonga, image, categorias.
     """
     p = ItemLoader(item=Product(), response=response)
     p.add_css('nome', 'h1 > span[itemprop=name]::text')
     p.add_value('url', response.url)
     p.add_css('descricaoLongaHtml', '.infoProdBox')
     p.add_css('descricaoLonga', '.infoProdBox')
     # FIX: ["|'] is a character class that also matched a literal '|';
     # the raw-string class ["'] matches only the quote characters.
     p.add_css('image',
               'ul.a-carousel-list > li > img',
               re=r'src=["\'](?P<src>[^"\']+)["\']')
     p.add_css('categorias',
               'div[class=breadcrumb-box] span[itemprop=name]::text')
     yield p.load_item()
Ejemplo n.º 7
0
    def parse(self, response):
        """Yield up to ``self.count_limit`` album items, then stop the spider."""
        new_albums = response.css(self.filter_css)
        for seen, new_album in enumerate(new_albums, start=1):
            # Abort the whole crawl once the configured quota is exceeded.
            if seen > self.count_limit:
                raise CloseSpider('done')

            loader = ItemLoader(KuwoScrapyItem(), new_album)
            loader.add_value('basic_source_info', '{}')
            loader.add_css('basic_source_name', self.name_css, TakeFirst())
            loader.add_css('basic_source_artist', self.artist_css, Join('&'))
            yield loader.load_item()
    def parse_product(self, response):
        """Build a Product item from a product detail page and yield it.

        Fields: nome, url, descricaoLonga, image, categorias.
        """
        p = ItemLoader(item=Product(), response=response)
        p.add_css('nome', 'h1.livedata::text')
        p.add_value('url', response.url)
        p.add_css('descricaoLonga', '.desc-info')
        # FIX: ["|'] also matched a literal '|' inside the character class;
        # raw-string ["'] matches only the quotes around the src value.
        p.add_css('image',
                  'div.container-product-image a.image-link > img',
                  re=r'src=["\'](?P<src>[^"\']+)["\']')
        p.add_css('categorias', 'span[itemprop=title]::text')
        yield p.load_item()


#run in the mongo shell
#db.produto.remove({'categorias.0': {$exists: false}})
#db.produto.remove({'categorias.0': {$nin: [' Games', ' Livros', ' DVDs e Blu-ray']}})

#delete duplicated products
#var duplicates = [];

#db.produto_novo.aggregate([
#{"$group" : { "_id": "$nome", "count": { "$sum": 1 }, "dups": { "$addToSet": "$_id" },  }},
#{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } }]
#,{allowDiskUse: true},{cursor:{}}
#).result.forEach(function(doc) {
#doc.dups.shift();
#doc.dups.forEach( function(dupId){
#duplicates.push(dupId);
#}
#)
#})
#printjson(duplicates);
#db.produto_novo.remove({_id:{$in:duplicates}})
 def parse_content(self, response):
     '''Parse a content page into a Rede item.

     Extracts link/title/text (plus an optional abstract) via CSS, then
     walks the page's dt/dd metadata pairs to pull out date and place.
     '''
     loader = ItemLoader(item=Rede(), response=response)
     # Usually, we are only interested in the first item, e.g. for title, place, etc.
     loader.default_output_processor = TakeFirst()
     # Add fields
     loader.add_value('link', response.url)
     loader.add_css('title', '.text h1', extract_text)
     # Test if text has an abstract
     abstract = response.css('.abstract')
     if abstract:
         loader.add_css('abstract', '.abstract', extract_text)
         # Only paragraphs following the abstract, skipping picture captions.
         loader.add_css('text', '.abstract ~ p:not(.picture)',
                        extract_text, Join('\n'))
     else:
         loader.add_css('text', '.text p:not(.picture)',
                        extract_text, Join('\n'))
     # Metadata are in dt/dd pairs.
     keys = response.css('dl dt::text').extract()
     values = response.css('dl dd::text').extract()
     for key, value in zip(keys, values):
         if key == 'Datum:':
             match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', value)
             if match:
                 # '22.03.2011' format
                 value = match.group(1)
                 # NOTE(review): encode-then-strptime is a Python 2 idiom;
                 # on Python 3 strptime rejects bytes — confirm the runtime.
                 dt = datetime.strptime(value.encode(ENC), '%d.%m.%Y')
             else:
                 # '22. März 2011' format
                 # NOTE(review): '%B' is locale-dependent; parsing German
                 # month names presumably relies on the process locale.
                 dt = datetime.strptime(value.encode(ENC), '%d. %B %Y')
             loader.add_value('date', dt.date())
         elif key == 'Ort:':
             loader.add_value('place', value)
     return loader.load_item()
Ejemplo n.º 10
0
def Loader_content(response):
    """Load a detail page: title/date/down plus cover and in-content image URLs."""
    loader = ItemLoader(item={}, response=response)
    loader.add_css('title', '.k_jianjie-3a-1-name::text')
    # The third text node of the info block holds the date string.
    loader.add_value('date', loader.get_xpath('//*[@class="k_jianjie-3a-2b"]/text()')[2])
    loader.add_css('down', '.k_jianjie-3a-5down::text', TrimAll())

    cover = loader.get_xpath('//*[@id="k_jianjie-2b"]/a/img/@src')
    body_imgs = loader.get_xpath('//*[@class="content"]/p/img/@src')
    loader.add_value('src_url', response.url)
    loader.add_value('preview', cover)
    loader.add_value('content', body_imgs)
    loader.add_value('image_urls', cover + body_imgs)
    print('正下载图片:', cover + body_imgs)
    return loader.load_item()
Ejemplo n.º 11
0
 def get_app(self, response):
     """Scrape one Play Store app detail page into a PlayStoreItems item."""
     il = ItemLoader(item=PlayStoreItems(), response=response)
     # Relative category/author hrefs are resolved against the page URL.
     absolutize = Compose(
         lambda urls: [urljoin(response.url, url) for url in urls])
     # (loader method, field name, selector, optional processors...)
     steps = (
         (il.add_css, 'app_id', '.details-wrapper::attr(data-docid)'),
         (il.add_css, 'name', '.document-title div::text'),
         (il.add_css, 'category', '.category span::text'),
         (il.add_css, 'category_url', '.category::attr(href)', absolutize),
         (il.add_css, 'price', '.details-actions .price span::text'),
         (il.add_css, 'offers_in_app_purchases', '.inapp-msg::text'),
         (il.add_css, 'stars_count', '.stars-count::text'),
         (il.add_css, 'video', '.details-trailer > span::attr(data-video-url)'),
         (il.add_css, 'screenshots', '.screenshot::attr(src)'),
         (il.add_xpath, 'description',
          '//div[contains(@class, "show-more-content")]/div//text()'),
         (il.add_css, 'update_date', '[itemprop="datePublished"]::text'),
         (il.add_css, 'file_size', '[itemprop="fileSize"]::text'),
         (il.add_css, 'installs', '[itemprop="numDownloads"]::text'),
         (il.add_css, 'current_version', '[itemprop="softwareVersion"]::text'),
         (il.add_css, 'requires_android', '[itemprop="operatingSystems"]::text'),
         (il.add_css, 'offered_by', '[itemprop="author"] > a span::text'),
         (il.add_css, 'offered_by_url', '[itemprop="author"] > a::attr(href)',
          absolutize),
     )
     for add, field, selector, *procs in steps:
         add(field, selector, *procs)
     yield il.load_item()
Ejemplo n.º 12
0
 def work(self, rp):
     """Load a NayangJobItem from a job-detail response ``rp`` and yield it."""
     # Debug trace of the request headers that produced this response.
     print(rp.request.headers, '+++++++++')
     i = ItemLoader(item=NayangJobItem(), response=rp)
     i.add_css('name', '.f_left>h2::text')
     i.add_css('company', '.gs_name2 a::text')
     i.add_css('education', '.clearfix em:nth-child(7)::text')
     # NOTE(review): ':nth-child(-2)' matches no element in standard CSS;
     # ':nth-last-child(2)' was probably intended — confirm against the page.
     i.add_css('money', '.clearfix em:nth-child(-2)::text')
     i.add_css('discript', '.bd')
     i.add_css('job_addr', '.clearfix em:nth-child(5)::text')
     yield i.load_item()
Ejemplo n.º 13
0
    def parse_property(self, response):
        """Collect the sale/photo fields for one property listing.

        The raw item is handed to ``self.process`` before being yielded.
        """
        loader = ItemLoader(PropertyScrapperItem(), response=response)
        field_css = (
            ('address', '.property-address::text'),
            ('suburb', 'dl.cN-featDetails-extended dd.suburb a::text'),
            ('sold_date', 'dl.cN-featDetails-extended dd.saleDate::text'),
            ('sold_price', 'dl.cN-featDetails-extended dd.price::text'),
            ('property_type',
             'dl.cN-featDetails-extended dd.propertytype::text'),
            ('floorplan_url',
             '#Content_Content_propertyPhotos_FloorplanLink::attr(href)'),
            ('photo_url',
             '#Content_Content_propertyPhotos_lnkPhoto::attr(href)'),
            ('sales_type', 'dl.cN-featDetails-extended dd.saleType::text'),
            # domain uses one "features" span for bed + bath + parking; it is
            # stored under 'bed' and split apart later in self.process.
            ('bed', 'dl.s-featSummary dd p.features span::text'),
        )
        for field, css in field_css:
            loader.add_css(field, css)
        yield self.process(loader.load_item())
Ejemplo n.º 14
0
	def get_app(self, response):
		"""Scrape one Play Store app detail page into a PlayStoreItems item."""
		il = ItemLoader(item=PlayStoreItems(), response=response)
		il.add_css('app_id', '.details-wrapper::attr(data-docid)')
		il.add_css('name', '.document-title div::text')
		il.add_css('category', '.category span::text')
		# Relative category hrefs are resolved against the page URL.
		il.add_css('category_url', '.category::attr(href)',
					Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
		il.add_css('price', '.details-actions .price span::text')
		il.add_css('offers_in_app_purchases', '.inapp-msg::text')
		il.add_css('stars_count', '.stars-count::text')
		il.add_css('video', '.details-trailer > span::attr(data-video-url)')
		il.add_css('screenshots', '.screenshot::attr(src)')
		il.add_xpath('description', '//div[contains(@class, "show-more-content")]/div//text()')
		il.add_css('update_date', '[itemprop="datePublished"]::text')
		il.add_css('file_size', '[itemprop="fileSize"]::text')
		il.add_css('installs', '[itemprop="numDownloads"]::text')
		il.add_css('current_version', '[itemprop="softwareVersion"]::text')
		il.add_css('requires_android', '[itemprop="operatingSystems"]::text')
		il.add_css('offered_by', '[itemprop="author"] > a span::text')
		# Relative author hrefs are resolved against the page URL as well.
		il.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)',
					Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
		yield il.load_item()
Ejemplo n.º 15
0
 def parse_titles(self, response, hub):
     """Yield a BlogCategory named ``hub`` holding the post titles on the page."""
     category_loader = ItemLoader(item=BlogCategory(), response=response)
     category_loader.add_value('name', hub)
     category_loader.add_css('posts', 'main > article h2.entry-title > a::text')
     yield category_loader.load_item()
Ejemplo n.º 16
0
 def parse_property(self, response):
     """Extract one sold-property listing and yield the post-processed item."""
     ldr = ItemLoader(PropertyScrapperItem(), response=response)
     ldr.add_css('address', '.property-address::text')
     ldr.add_css('suburb', 'dl.cN-featDetails-extended dd.suburb a::text')
     ldr.add_css('sold_date', 'dl.cN-featDetails-extended dd.saleDate::text')
     ldr.add_css('sold_price', 'dl.cN-featDetails-extended dd.price::text')
     ldr.add_css('property_type', 'dl.cN-featDetails-extended dd.propertytype::text')
     ldr.add_css('floorplan_url', '#Content_Content_propertyPhotos_FloorplanLink::attr(href)')
     ldr.add_css('photo_url', '#Content_Content_propertyPhotos_lnkPhoto::attr(href)')
     ldr.add_css('sales_type', 'dl.cN-featDetails-extended dd.saleType::text')
     # Domain bundles bed + bath + parking in one "features" span; we store
     # it under 'bed' and unpack it later in self.process.
     ldr.add_css('bed', 'dl.s-featSummary dd p.features span::text')
     yield self.process(ldr.load_item())
Ejemplo n.º 17
0
    def parse_content(self, response):
        """Parse a paper-listing content page into a PaperEduItem.

        Skips URLs already stored (unless ``self.refetch`` is set), extracts
        fields via the class-level ``_CSS``/``_XPATH`` maps, then
        post-processes: site id, subject id from the <title>, keyword
        fallbacks, publish date, list joins, replacements and
        split-and-join normalisation.  Returns the finished item, or None
        when the page is skipped.
        """
        django_istance = self._Model.objects.filter(url=response.url)
        # Already stored and no refetch requested: ignore this page.
        if django_istance and not self.refetch:
            return
        if django_istance:
            # Refetching: drop the stale rows first.
            django_istance.delete()
        sel = Selector(response)
        loader = ItemLoader(item=PaperEduItem(), response=response)
        # parse page
        loader.add_value('url', response.url)
        raw_html = None
        try:
            raw_html = response.body_as_unicode()
        except Exception:
            # FIX: narrowed from a bare ``except:`` (which also swallowed
            # SystemExit/KeyboardInterrupt); keep the lossless latin-1
            # fallback for undecodable bodies.
            raw_html = response.body.decode('latin-1')
        loader.add_value('raw_html', raw_html)
        for attr, css in self._CSS.iteritems():
            loader.add_css(attr, css)
        for attr, xpath in self._XPATH.iteritems():
            loader.add_xpath(attr, xpath)

        pub_css = '#right > div.grid_10.omega.alpha > div.r_two > div.cmtdiv .tip'
        tip = sel.css(pub_css)
        pub_date = tip.re(u'发布时间:\s*(\d+-\d+-\d+)')

        item = loader.load_item()
        # Special-field handling below.

        # Site identifier.
        item['site_id'] = SITE_PAPER_EDU

        # Subject id is taken from the middle segment of the page <title>.
        title = sel.css('title::text').extract()[0]
        subject = title.split(' - ')[1]
        item['subject_id'] = SUBJECT_ID.get(subject, -1)

        # Some pages are malformed; fall back to correction XPaths for
        # fields (e.g. keywords) that came back empty.
        for attr, xpath_correction in self._XPATH_CORRECTION.iteritems():
            if not ''.join(item.get(attr, '')).strip(' ;\n'):
                item[attr] = sel.xpath(xpath_correction).extract()[0]

        try:
            pub_date = pub_date[0]
            pub_date = datetime.strptime(pub_date, self.PUB_DATE_FORMAT).date()
        except (IndexError, ValueError):
            # FIX: merged two identical handlers — a missing match or an
            # unparsable date both mean "no publish date".
            pub_date = None
        item['pub_date'] = pub_date

        # Join list-valued fields with their configured separator.
        for attr, value in item.iteritems():
            if isinstance(value, list):
                item[attr] = self._JOIN.get(attr, '').join(value)

        # Field replacements, e.g. normalising full-width commas in keywords.
        for attr, _r in self._REPLACE.iteritems():
            old, new = _r
            item[attr] = re.sub(old, new, item[attr])

        # Irregular pages split keywords by spaces or ';'; re-join uniformly.
        for attr, _r in self._SPLIT_AND_JOIN.iteritems():
            pattern, join_str, judge_func = _r
            if judge_func(item[attr]):
                item[attr] = join_str.join(re.split(pattern, item[attr]))

        return item