def parse_item(self, response):
        """Extract a UKBusinessItem from a business detail page.

        Fields are collected twice where possible — once from the
        OpenGraph <meta> tags, once from the visible page markup — so the
        loader's processors can pick whichever source is present.
        Returns the loaded item, or a retry Request when the error
        handler decides the response must be fetched again.
        """
        # error_handler returns a Request when the page should be retried.
        request_again = self.error_handler(response)
        if request_again:
            return request_again
        il = ItemLoader(item=UKBusinessItem(), response=response)

        # From the OG section at the top
        il.add_xpath('name', '//meta[@property="og:title"]/@content')
        il.add_xpath('url', '//meta[@property="og:url"]/@content')
        il.add_xpath('latitude', '//meta[@property="og:latitude"]/@content')
        il.add_xpath('longitude', '//meta[@property="og:longitude"]/@content')
        il.add_xpath('address', '//meta[@property="og:street-address"]/@content')
        il.add_xpath('locality', '//meta[@property="og:locality"]/@content')
        il.add_xpath('postal_code', '//meta[@property="og:postal-code"]/@content')
        il.add_xpath('country', '//meta[@property="og:country-name"]/@content')

        # XPaths below are from the display
        il.add_xpath('name', '//span[@class="busname"]/text()')
        # No OG for this
        il.add_xpath('phone_number', '//span[@class="bustel"]/text()')
        il.add_xpath('website', '//a[@id="linkWebsite"]/@href')
        il.add_xpath('address', '//span[@data-yext="address.address"]/text()')
        il.add_xpath('locality', '//span[@itemprop="addressLocality"]/text()')
        il.add_xpath('postal_code', '//span[@itemprop="postalCode"]/text()')
        # Unicoded so it can share an input processor with the rest
        # (Python 2: unicode() builtin).
        il.add_value('url', unicode(response.url))
        return il.load_item()
Beispiel #2
0
 def parse_item(self, response):
     """Load a MeizituItem (name, tags, image URLs) from a gallery page."""
     loader = ItemLoader(item=MeizituItem(), response=response)
     loader.add_xpath('name', '//h2/a/text()')
     loader.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
     # Identity() keeps every matched src instead of collapsing to one value.
     loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
Beispiel #3
0
 def parse(self, response):
     """Load a JianshuArticleItem with the article body and page URL."""
     loader = ItemLoader(item=JianshuArticleItem(), response=response)
     # Paragraph text of the article body only.
     loader.add_xpath('content',
                      '//div[@class="article"]/div[@class="show-content"]/p/text()')
     loader.add_value('url', response.url)
     return loader.load_item()
Beispiel #4
0
    def parse_item(self, response):
        """Yield one SkatRTSItem (real-time skater stats) per player row.

        Assumes self.year holds the season being scraped.
        """
        sel = Selector(response)

        # collect xpaths of each player (row in table)
        rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')

        # loop through players
        for row in rows:
            loader = ItemLoader(SkatRTSItem(), selector=row)
            loader.default_input_processor = MapCompose()
            loader.default_output_processor = Join()

            # get unique NHL ID number from player's page URL
            # (last seven characters of the href — assumes 7-digit IDs)
            num = row.xpath("td[2]/a/@href").extract()
            sNum = num[0][-7:]
            loader.add_value("nhl_num", sNum)

            # add season data
            loader.add_value("season", str(self.year))

            # collect stats (fixed column positions in the stats table)
            loader.add_xpath("hits", ".//td[6]/text()")
            loader.add_xpath("blocked_shots", ".//td[7]/text()")
            loader.add_xpath("missed_shots", ".//td[8]/text()")
            loader.add_xpath("giveaways", ".//td[9]/text()")
            loader.add_xpath("takeaways", ".//td[10]/text()")
            loader.add_xpath("faceoff_wins", ".//td[11]/text()")
            loader.add_xpath("faceoff_losses", ".//td[12]/text()")

            # feed item to pipeline
            yield loader.load_item()
Beispiel #5
0
    def parse_item(self, response):
        """Populate a ChinazItem from a site-profile page.

        Most fields are located via their Chinese label text inside the
        page markup (e.g. u'建站时间:'); those literals are part of the
        XPath queries and must not be altered.
        """
        loader = ItemLoader(ChinazItem(), response)
        loader.add_value("url", response.url)
        loader.add_xpath("name", u'//span[@id="spanwillchuanwebName"]/following-sibling::text()')
        loader.add_xpath("domain", u'//a[@id="linkUrl"]/text()')
        loader.add_xpath("homepage", u'//a[@id="linkUrl"]/@href')
        loader.add_xpath("founded", u'//span[.="建站时间:"]/following-sibling::text()')
        loader.add_xpath("company", u'//span[.="网站所属:"]/following-sibling::text()')
        loader.add_xpath("location", u'//span[.="所属地区:"]/following-sibling::a//text()')
        loader.add_xpath("founder", u'//span[.="创始人/团队:"]/following-sibling::text()')
        loader.add_xpath("categories", u'//span[.="网站类型:"]/following-sibling::a//text()')
        # The star-image filename encodes the score: star_<n>.
        loader.add_xpath("rating", u'//td[b="用户评分:"]/following-sibling::td/img/@src', re=r"star_(\d)")
        loader.add_xpath("keywords", u'//td[starts-with(b, "关 键 词")]/following-sibling::td/a/text()')
        loader.add_xpath("brief", u'//td[b="网站简介:"]/following-sibling::td/text()')
        loader.add_xpath("alexa_rank", u'//span[.="Alexa排名:"]/following-sibling::text()')
        loader.add_xpath("baidu_weight", u'//td[.="百度权重:"]/following-sibling::td/img/@alt')
        loader.add_xpath("google_pagerank", u'//td[.="PR值:"]/following-sibling::td/img/@alt')
        loader.add_xpath("chinaz_rank", u'//td[@class="scored"]/span/text()')
        loader.add_xpath("backlink_num", u'//span[.="网站反链数: "]/following-sibling::text()')
        loader.add_xpath("keyword_num", u'//a[@id="tdgjcs"]/text()')
        loader.add_xpath("domain_birth", u'//span[.="域名年限:"]/following-sibling::text()', re=r":([0-9-]+)\)")
        loader.add_xpath("baidu_idx_num", u'//span[.="百度收录:"]/following-sibling::text()')
        loader.add_xpath("google_idx_num", u'//span[.="谷歌收录:"]/following-sibling::text()')
        loader.add_xpath("sogou_idx_num", u'//span[.="搜狗收录:"]/following-sibling::text()')
        # Company-introduction block: strip markup, then trim whitespace
        # (Python 2: unicode.strip used as a processor).
        loader.add_xpath(
            "introduction",
            u'//div[h3="公司简介"]/following-sibling::div[1]',
            MapCompose(Compose(remove_tags, unicode.strip)),
        )
        # Snapshot image URLs are relative; join against the site root.
        loader.add_xpath("snapshot", u"//figure/img/@src", MapCompose(partial(urljoin, "http://top.chinaz.com/")))
        return loader.load_item()
Beispiel #6
0
 def parse_item(self, response):
     """Yield one GoalSTItem (goalie ES/PP/SH splits) per player row."""
     table_rows = Selector(response).xpath(
         '/html//div[@class="table-container"]/table/tbody/tr')

     for player_row in table_rows:
         loader = ItemLoader(GoalSTItem(), selector=player_row)
         loader.default_input_processor = MapCompose()
         loader.default_output_processor = Join()

         # The last seven characters of the player link hold the NHL ID.
         player_href = player_row.xpath('td[2]/a/@href').extract()
         loader.add_value('nhl_num', player_href[0][-7:])

         # Season being scraped.
         loader.add_value('season', str(self.year))

         # Even-strength, power-play and short-handed goaltending columns,
         # in the same order and positions as the source table.
         stat_columns = [
             ('es_shots_against', 6), ('es_goals_against', 7),
             ('es_saves', 8), ('es_save_pct', 9),
             ('pp_shots_against', 10), ('pp_goals_against', 11),
             ('pp_saves', 12), ('pp_save_pct', 13),
             ('sh_shots_against', 14), ('sh_goals_against', 15),
             ('sh_saves', 16), ('sh_save_pct', 17),
         ]
         for field, col in stat_columns:
             loader.add_xpath(field, './/td[%d]/text()' % col)

         yield loader.load_item()
Beispiel #7
0
    def parse_by_product(self, response):
        """
        For the 'Bundles' category, grab the product details for the first
        product listed.

        Yields one VisionsProduct with product name, sale price,
        availability and a category derived from the page URL.
        """
        self.selector = Selector(response)
        self.results = self.selector.xpath('//*[@id="ctl00_tdMainPanel"]')
        loader = ItemLoader(item=VisionsProduct(),
                            selector=self.results[0])

        self.field_xpaths = {
            'product': ('div[contains(@class, "catalogueTitle")]'
                        '/h3/text()'),
            'price': ('div[@id="ctl00_ContentPlaceHolder1_pnl'
                      'Bundle"]/div[@id="divProductDetails"]/div'
                      '[contains(@class, "priceAddToCart")]/div[1]/span'
                      '[contains(@id, "SalePrice")]/text()')
        }

        # Extract and load product details.  The price pattern is a raw
        # string: '\$' and '\d' are invalid escape sequences in plain
        # string literals on modern Pythons.
        loader.add_xpath('product', self.field_xpaths['product'])
        loader.add_xpath('price', self.field_xpaths['price'],
                         re=r'\$[\d]*[,]*[\d]*\.[\d]*')
        loader.add_value('availability', 'Not Limited/Clearance Item')

        # Because it's an individual product page, manually set the category
        self.category = '/'.join(['Home', response.url.split('/')[4]])
        loader.add_value('category', self.category)

        yield loader.load_item()
Beispiel #8
0
    def parse_item(self, response):
        """Yield one SkatSOItem (shootout stats) per player row."""
        sel = Selector(response)

        # collect xpaths of each player (row in table)
        rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')

        # loop through players
        for row in rows:
            loader = ItemLoader(SkatSOItem(), selector=row)
            loader.default_input_processor = MapCompose()
            loader.default_output_processor = Join()

            # get unique NHL ID number from player's page URL
            # (last seven characters of the href — assumes 7-digit IDs)
            num = row.xpath("td[2]/a/@href").extract()
            sNum = num[0][-7:]
            loader.add_value("nhl_num", sNum)

            # add season data
            loader.add_value("season", str(self.year))

            # collect stats (fixed column positions in the stats table)
            loader.add_xpath("so_shots", ".//td[13]/text()")
            loader.add_xpath("so_goals", ".//td[14]/text()")
            loader.add_xpath("so_pct", ".//td[15]/text()")
            loader.add_xpath("game_deciding_goals", ".//td[16]/text()")

            # feed item to pipeline
            yield loader.load_item()
    def get_new(self, response):
        """Build a New item from an article page.

        Defaults are filled in for missing keywords and date; the item is
        only yielded when both a title and a body were found.
        """
        sel = Selector(response)
        il = ItemLoader(item=New())
        il.add_value('tema', ['Marketing y Publicidad'])
        il.add_value('titulo', sel.xpath('//h1[@class="glr-post-title glr-mb-10"]/text()').extract())
        il.add_value('texto', sel.xpath('//div[@class="glr-post-entry"]').extract())
        il.add_value('fecha', sel.xpath('//span[@class="glr-left glr-post-date"]/text()').extract())
        il.add_value('keywords', sel.xpath('//div[@class="post-tags"]//a/text()').extract())
        item = il.load_item()

        # Fall back to the section name when the post carries no tags.
        if 'keywords' not in item:
            item['keywords'] = ['Marketing y Publicidad']

        # Normalise the date, defaulting when the page has none.
        if 'fecha' in item:
            item['fecha'] = self.parse_date(item['fecha'])
        else:
            item['fecha'] = '10/05/2015'

        # Only complete articles (title + body) are worth emitting.
        if 'titulo' in item and 'texto' in item:
            yield item
        '''res = []
    def get_new(self, response):
        """Build a New item from an article page.

        Defaults are filled in for missing keywords and date; the item is
        only yielded when both a title and a body were found.

        Bug fix: the original debug branch executed
        ``print item['titulo']`` precisely when 'titulo' was absent from
        the item, which raised KeyError.  That branch is removed.
        """
        sel = Selector(response)
        il = ItemLoader(item=New())
        il.add_value('tema', ['Marketing y Publicidad'])
        il.add_value('titulo', sel.xpath('//h1/text()').extract())
        il.add_value('texto', sel.xpath('//div[contains(@class,"post-detalle")]').extract())
        il.add_value('fecha', sel.xpath('//p[@itemprop="datePublished"]/text()').extract())
        il.add_value('keywords', sel.xpath('//div[contains(@class,"tags")]/a/text()').extract())
        item = il.load_item()

        # Fall back to the section name when the post carries no tags.
        if 'keywords' not in item:
            item['keywords'] = ['Marketing y Publicidad']

        # Normalise the date, defaulting when the page has none.
        if 'fecha' in item:
            item['fecha'] = self.parse_date(item['fecha'])
        else:
            item['fecha'] = '10/05/2015'

        # Only complete articles (title + body) are worth emitting.
        if 'titulo' in item and 'texto' in item:
            yield item
Beispiel #11
0
 def parse_item(self, response):
     """Collect image URLs from a meizitu.com gallery page
     (e.g. http://www.meizitu.com/a/5336.html)."""
     loader = ItemLoader(item=MeizituItem(), response=response)
     # Identity() keeps the full list of image sources.
     loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src",
                      Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
Beispiel #12
0
    def process_row(self, row, task):
        """Convert one exported row into a WV_DrillingPermit item.

        New items are yielded; items already present in the database only
        bump a stats counter.  Recently issued permits additionally
        produce feed entries.
        """
        stats = self.crawler.stats

        # add_value(None, row) lets the loader take field names from the
        # row mapping itself.
        l = ItemLoader(WV_DrillingPermit())
        l.add_value(None, row)
        item = l.load_item()

        # NOTE(review): item['API'] etc. raise KeyError when a field was
        # absent from the row — presumably every row carries these
        # columns; confirm against the data source.
        if item['API'] and item['permit_activity_type'] and item[
                'permit_activity_date']:
            # Dedupe on the (API, activity type, activity date) triple.
            existing_item = self.db.loadItem(
                item, {
                    'API': item['API'],
                    'permit_activity_type': item['permit_activity_type'],
                    'permit_activity_date': item['permit_activity_date']
                })

            if existing_item:
                stats.inc_value('_existing_count', spider=self)
            else:
                stats.inc_value('_new_count', spider=self)
                yield item

                dt = datetime.strptime(item['permit_activity_date'],
                                       '%Y-%m-%d %H:%M:%S')
                #                if item['permit_activity_type'] in ('Permit Issued', 'Permit Commenced', 'Permit Completed'):
                # Feed entries only for permits issued within the last year.
                if item['permit_activity_type'] in (
                        'Permit Issued', 'Permits Issued'
                ) and datetime.now() - dt < timedelta(days=365):
                    for item in self.create_feed_entry(item, task):
                        yield item
Beispiel #13
0
 def parsePage(self, response):
     """Load a RentItem for a single rental-listing page."""
     loader = ItemLoader(item=RentItem(), response=response)
     # Unique id: spider name plus the numeric tail of the page URL.
     page_id = response.url.split('/')[-1].split('.')[0]
     loader.add_value('id', self.name + '-' + page_id)
     loader.add_value('link', response.url)
     loader.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
     return loader.load_item()
Beispiel #14
0
    def parse_review(self, response):
        """Parse a review page into one item per review.

        A 'master' item collects the page-level fields; each review
        selector then gets its own loader seeded with a copy of the
        master item.
        """
        sel = Selector(response)

        # Skip pages outside the category this spider targets.
        if not self._is_right_category(sel):
            self.log('Skip URL: %s' % response.url, level=log.INFO)
            return

        self.log('Parse URL: %s' % response.url, level=log.INFO)

        loader = ItemLoader(item=YelpReview(), selector=sel)
        loader.add_value('crawl_date', '%s' % datetime.utcnow())
        loader.add_value('page_url', response.url)

        # Loop over all the fields we need to extract.
        # (Python 2: dict.iteritems)
        for field, selector in self._item_selectors.iteritems():
            loader.add_xpath(field, selector)

        master_review = loader.load_item()
        review_selectors = sel.xpath('//div[contains(@class, "review")][@itemprop="review"]')

        for rev_sel in review_selectors:
            # Each review starts from a copy of the page-level fields.
            review_loader = ItemLoader(item=master_review.copy(), selector=rev_sel)

            for field, selector in self._review_selectors.iteritems():
                review_loader.add_xpath(field, selector)

            yield review_loader.load_item()

        return
Beispiel #15
0
    def parse_item(self, response):
        """Yield one SkatEngItem (empty-net / penalty-shot goals) per player.

        Column positions shift by one for seasons after 2005 because the
        table gains a shootout column.
        """
        sel = Selector(response)

        # collect xpaths of each player (row in table)
        rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')

        # prepare to adjust for shootout stats if necessary
        shootout = 0
        if self.year > 2005:
            shootout = 1

        # loop through players
        for row in rows:
            loader = ItemLoader(SkatEngItem(), selector=row)
            loader.default_input_processor = MapCompose()
            loader.default_output_processor = Join()

            # get unique NHL ID number from player's page URL
            # (last seven characters of the href — assumes 7-digit IDs)
            num = row.xpath("td[2]/a/@href").extract()
            sNum = num[0][-7:]
            loader.add_value("nhl_num", sNum)

            # add season data
            loader.add_value("season", str(self.year))

            # collect stats (columns offset by the shootout column)
            if shootout:
                loader.add_xpath("en_goals", ".//td[20]/text()")
                loader.add_xpath("ps_goals", ".//td[21]/text()")
            else:
                loader.add_xpath("en_goals", ".//td[21]/text()")
                loader.add_xpath("ps_goals", ".//td[22]/text()")

            # feed item to pipeline
            yield loader.load_item()
Beispiel #16
0
    def parse(self, response):
        """Yield one StandingsItem per team row in the standings table."""
        sel = Selector(response)

        # collect xpaths of each team (row in table)
        rows = sel.xpath('/html//div[@class="contentBlock"]/table/tbody/tr')

        # loop through teams
        for row in rows:
            loader = ItemLoader(StandingsItem(), selector=row)
            loader.default_input_processor = MapCompose()
            loader.default_output_processor = Join()

            # get team identifier (rel attribute of the team link)
            team = row.xpath('td[2]/a[1]/@rel').extract()
            loader.add_value('team', team)

            # collect several other data points (fixed column positions)
            loader.add_xpath('division', './/td[3]/text()')
            loader.add_xpath('games_played', './/td[4]/text()')
            loader.add_xpath('wins', './/td[5]/text()')
            loader.add_xpath('losses', './/td[6]/text()')
            loader.add_xpath('ot_losses', './/td[7]/text()')
            loader.add_xpath('points', './/td[8]/text()')
            loader.add_xpath('row', './/td[9]/text()')

            # feed item to pipeline
            yield loader.load_item()
    def parse_product(self, response):
        """Load a Product (name, description, image, categories) from a
        product page."""
        loader = ItemLoader(item=Product(), response=response)
        loader.add_css('nome', 'h1.livedata::text')
        loader.add_value('url', response.url)
        loader.add_css('descricaoLonga', '.desc-info')
        # Pull the raw src attribute out of the <img> tag markup.
        loader.add_css('image',
                       'div.container-product-image a.image-link > img',
                       re='src=[\"|\'](?P<src>[^\"\']+)[\"|\']')
        loader.add_css('categorias', 'span[itemprop=title]::text')
        yield loader.load_item()


#executar no mongo
#db.produto.remove({'categorias.0': {$exists: false}})
#db.produto.remove({'categorias.0': {$nin: [' Games', ' Livros', ' DVDs e Blu-ray']}})

#deleta produtos duplicados
#var duplicates = [];

#db.produto_novo.aggregate([
#{"$group" : { "_id": "$nome", "count": { "$sum": 1 }, "dups": { "$addToSet": "$_id" },  }},
#{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } }]
#,{allowDiskUse: true},{cursor:{}}
#).result.forEach(function(doc) {
#doc.dups.shift();
#doc.dups.forEach( function(dupId){
#duplicates.push(dupId);
#}
#)
#})
#printjson(duplicates);
#db.produto_novo.remove({_id:{$in:duplicates}})
Beispiel #18
0
 def parse_detail(self, response):
     """Load a MeizituItem (title + image URLs) from a detail page."""
     loader = ItemLoader(item=MeizituItem(), response=response)
     loader.add_xpath("title", "//h2/a/text()")
     loader.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
     loader.add_value('url', response.url)
     return loader.load_item()
Beispiel #19
0
    def get_product_details(self, response):
        """Yield a VisionsProduct assembled from breadcrumbs plus the
        page's detail extractors.

        Empty listing pages get 'N/A' placeholders (price excluded) so
        downstream consumers still receive a complete record.
        """
        crumbs = self.get_breadcrumbs(response)
        loader = ItemLoader(item=VisionsProduct())

        loader.add_value('breadcrumbs', crumbs)
        loader.add_value('url', response.url)

        # A single string (Python 2 basestring) means one flat category.
        if isinstance(crumbs, basestring):
            loader.add_value('category', crumbs)

        # Ensure we aren't wasting time extracting from an empty page
        if extract_helper(response, self.EMPTY_PAGE_CHECK):
            for d in self.PRODUCT_DETAILS:
                if '_' not in d.name:  # Don't load price
                    loader.add_value(d.name, 'N/A')
        else:
            productDetails = detailsRunner(self.PRODUCT_DETAILS,
                                           response=response)

            # Fall back to the price scraped from the GIF variant.
            if not productDetails['price']:
                productDetails['price'] = productDetails['price_gif']

            productDetails.pop('price_gif')

            # Fix truncated image urls
            if productDetails['image']:
                productDetails['image'] = add_schema(response.url,
                                                     productDetails['image'])

            for d in productDetails:
                loader.add_value(d, productDetails[d])

        yield loader.load_item()
Beispiel #20
0
 def parse_course_item(self, response):
     """Yield a CourseItem scraped from a course handbook page, then
     request its timetable page for class parsing.

     Python 2 code: unicode() and unicode.strip are used as processors.
     """
     url_obj = urlparse(response.url)
     l = ItemLoader(item=CourseItem(), response=response)
     l.default_input_processor = MapCompose(unicode.strip)
     l.default_output_processor = TakeFirst()
     l.add_xpath('code', "/html/head/meta[@name='DC.Subject.ProgramCode']/@content")
     l.add_xpath('name', "/html/head/meta[@name='DC.Subject.Description.Short']/@content")
     l.add_xpath('career', "/html/head/meta[@name='DC.Subject.Level']/@content")
     # Per-field input processors: ItemLoader resolves '<field>_in'
     # attributes, so these instance assignments override the default.
     l.year_in = Identity()
     # The year is the last directory component of the URL path.
     l.add_value('year', ppath.basename(ppath.dirname(url_obj.path)))
     l.add_value('src_url', unicode(response.url))
     l.add_xpath('uoc', "/html/head/meta[@name='DC.Subject.UOC']/@content")
     l.gened_in = MapCompose(unicode.strip, lambda s: s == 'Y')
     l.add_xpath('gened', "/html/head/meta[@name='DC.Subject.GenED']/@content")
     l.add_xpath('faculty', "/html/head/meta[@name='DC.Subject.Faculty']/@content")
     l.add_xpath('school', ( "//div[@class='column content-col']/div[@class='internalContentWrapper']"
                             "/div[@class='summary']/p[strong[text()[contains(.,'School')]]]/a/text()"))
     l.add_xpath('campus', ( "//div[@class='column content-col']/div[@class='internalContentWrapper']"
                             "/div[@class='summary']/p[strong[text()[contains(.,'Campus')]]]/text()"))
     l.add_xpath('prereqs_str', ( "//div[@class='column content-col']/div[@class='internalContentWrapper']"
                     "/div[@class='summary']/p[text()[contains(.,'Prerequisite:')]]/text()"), 
                     re=r'Prerequisite:\s(.+)')
     l.add_xpath('eftsl', ( "//div[@class='column content-col']/div[@class='internalContentWrapper']"
                             "/div[@class='summary']/p[strong[text()[contains(.,'EFTSL')]]]/text()"))
     l.add_xpath('description_markup', ( "//div[@class='column content-col']/div[@class='internalContentWrapper']"
                                         "/h2[text()='Description']/following-sibling::div"))
     course_item = l.load_item()
     yield course_item
     # Follow the Timetable link; pass identifying fields along in meta.
     yield Request(url=response.xpath(("//div[@class='column content-col']/div[@class='internalContentWrapper']"
                                     "/div[@class='summary']//a[text()[contains(.,'Timetable')]]/@href")).extract()[0], 
                     callback=self.parse_class_item, 
                     meta=dict(course_identifier={k: course_item.get(k, None) for k in ('code', 'career', 'year', )}))
Beispiel #21
0
    def parse_content(self, response):
        """Store the page URL and raw HTML body in a TutorialItem."""
        loader = ItemLoader(item=TutorialItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('html', response.body)
        return loader.load_item()
 def parse(self, response):
     """Build one CategoryItem per menu heading on the page."""
     items = []
     for heading in response.css(".menu_box .menu_main h2"):
         loader = ItemLoader(item=CategoryItem(), response=response)
         # self.get_text processes the extracted <h2> markup fragment.
         loader.add_value("category", heading.extract(), self.get_text)
         items.append(loader.load_item())
     return items
Beispiel #23
0
    def parse_item(self, response):
        """Collect image URLs and the page URL into a MeizituItem."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        # Identity() keeps every matched image src.
        loader.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
        loader.add_value('url', response.url)
        return loader.load_item()
Beispiel #24
0
    def parse_item(self, response):
        """Load name, tags and image URLs into a MeizituItem."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        loader.add_xpath('name', '//h2/a/text()')
        loader.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
        # Identity() preserves the whole list of image sources.
        loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
        loader.add_value('url', response.url)
        return loader.load_item()
 def parse_item(self, response):
     """Load a CrawlpictureItem: title, tags and every image URL."""
     loader = ItemLoader(item=CrawlpictureItem(), response=response)
     loader.add_xpath('name', '//h2/a/text()')
     loader.add_css('tags', 'div.metaRight p::text')
     # Identity() keeps all matched sources rather than only the first.
     loader.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
 def parse_content(self, response):
     """Fill a BbsItem with url, forum, poster and post content."""
     loader = ItemLoader(item=BbsItem(), response=response)
     loader.add_value("url", str(response.url))
     # XPath queries live in the spider's _x_query mapping.
     loader.add_xpath("forum", self._x_query["forum"])
     loader.add_xpath("poster", self._x_query["poster"])
     loader.add_xpath("content", self._x_query["page_content"])
     return loader.load_item()
 def parse(self, response):
     """Build one CategoryItem per menu heading on the page."""
     items = []
     for heading in response.css(".menu_box .menu_main h2"):
         loader = ItemLoader(item=CategoryItem(), response=response)
         # self.get_text processes the extracted <h2> markup fragment.
         loader.add_value("category", heading.extract(), self.get_text)
         items.append(loader.load_item())
     return items
Beispiel #28
0
 def parse_item(self, response):
     """Load a CoserItem; image URLs are rewritten to full resolution."""
     loader = ItemLoader(item=CoserItem(), response=response)
     loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
     loader.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
     # Strip the '/w650' suffix to address the original-size images.
     srcs = loader.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
     loader.add_value('image_urls', [src.replace('/w650', '') for src in srcs])
     loader.add_value('url', response.url)
     return loader.load_item()
Beispiel #29
0
    def parse(self, response):
        """Yield one Articulos item (title + positional id) per article."""
        articulos = Selector(response).xpath(
            '/html/body/div[2]/div/div/div/div[1]/div[3]/div')
        for idx, node in enumerate(articulos):
            loader = ItemLoader(Articulos(), node)
            loader.add_xpath('title', './/h3/text()')
            loader.add_value('id', idx)
            yield loader.load_item()
Beispiel #30
0
	def parse_content(self, response):
		"""Fill a ScrapyspiderItem with url, forum, poster and content."""
		loader = ItemLoader(item=ScrapyspiderItem(), response=response)
		loader.add_value('url', str(response.url))
		# XPath queries live in the spider's _x_query mapping.
		loader.add_xpath('forum', self._x_query['forum'])
		loader.add_xpath('poster', self._x_query['poster'])
		loader.add_xpath('content', self._x_query['page_content'])
		return loader.load_item()
Beispiel #31
0
 def test_load_item_using_default_loader(self):
     """ItemLoader without a custom loader mutates and returns its item."""
     original = TestItem()
     original['summary'] = u'lala'
     loader = ItemLoader(item=original)
     loader.add_value('name', u'marta')
     loaded = loader.load_item()
     # load_item hands back the identical object, values collected as lists.
     assert loaded is original
     self.assertEqual(loaded['summary'], u'lala')
     self.assertEqual(loaded['name'], [u'marta'])
Beispiel #32
0
    def parse_item(self, response):
        """Load an ImgDownloadItem (article title + image URLs)."""
        self.logger.info("parse_item url %s" % response.url)
        loader = ItemLoader(item=ImgDownloadItem(), response=response)
        loader.add_xpath('name', '//h1[@class="article-title"]/a/text()')
        # Identity() keeps the complete list of image sources.
        loader.add_xpath('image_urls', "//article[@class='article-content']/p/img/@src", Identity())
        loader.add_value('url', response.url)
        return loader.load_item()
Beispiel #33
0
 def parse(self, response):
     """Load a CoserItem; image URLs are rewritten to full resolution."""
     loader = ItemLoader(item=CoserItem(), response=response)
     loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
     loader.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
     # Strip the '/w650' suffix to address the original-size images.
     srcs = loader.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
     loader.add_value('image_urls', [src.replace('/w650', '') for src in srcs])
     loader.add_value('url', response.url)
     return loader.load_item()
 def test_load_item_using_default_loader(self):
     """ItemLoader without a custom loader mutates and returns its item."""
     original = TestItem()
     original["summary"] = u"lala"
     loader = ItemLoader(item=original)
     loader.add_value("name", u"marta")
     loaded = loader.load_item()
     # load_item hands back the identical object, values collected as lists.
     assert loaded is original
     self.assertEqual(loaded["summary"], u"lala")
     self.assertEqual(loaded["name"], [u"marta"])
Beispiel #35
0
    def parse_content(self, response):
        """Fill a BbsItem with url, forum, poster and post content."""
        loader = ItemLoader(item=BbsItem(), response=response)
        loader.add_value('url', str(response.url))
        # XPath queries live in the spider's _x_query mapping.
        loader.add_xpath('forum', self._x_query['forum'])
        loader.add_xpath('poster', self._x_query['poster'])
        loader.add_xpath('content', self._x_query['page_content'])
        return loader.load_item()
Beispiel #36
0
    def parse2(self, response):
        """Parse a JSON listing response and yield one XqtestItem per entry.

        The payload is expected to carry a top-level 'list' of dicts whose
        'data' field becomes the item title.
        """
        payload = json.loads(response.body_as_unicode())
        # Iterate entries directly instead of indexing via range(len(...)).
        for entry in payload['list']:
            loader = ItemLoader(item=XqtestItem())
            loader.add_value('title', entry['data'])
            yield loader.load_item()
Beispiel #37
0
 def test_load_item_using_default_loader(self):
     """ItemLoader without a custom loader mutates and returns its item."""
     original = TestItem()
     original['summary'] = u'lala'
     loader = ItemLoader(item=original)
     loader.add_value('name', u'marta')
     loaded = loader.load_item()
     # load_item hands back the identical object, values collected as lists.
     assert loaded is original
     self.assertEqual(loaded['summary'], u'lala')
     self.assertEqual(loaded['name'], [u'marta'])
 def parse_product(self, response):
     """Load a Product (name, descriptions, image, breadcrumb categories)
     from a product page."""
     loader = ItemLoader(item=Product(), response=response)
     loader.add_css('nome', 'h1 > span[itemprop=name]::text')
     loader.add_value('url', response.url)
     loader.add_css('descricaoLongaHtml', '.infoProdBox')
     loader.add_css('descricaoLonga', '.infoProdBox')
     # Extract the src attribute from the carousel <img> markup.
     loader.add_css('image', 'ul.a-carousel-list > li > img', re='src=[\"|\'](?P<src>[^\"\']+)[\"|\']')
     loader.add_css('categorias', 'div[class=breadcrumb-box] span[itemprop=name]::text')
     yield loader.load_item()
Beispiel #39
0
    def parse(self, response):
        """Yield one ProxyHunterItem (protocol, ip, port) per table row."""
        # Only rows with exactly six cells are data rows.
        for e in response.xpath(
                '//table[@id="tbl_proxy_list"]//tr[count(td)=6]'):
            l = ItemLoader(ProxyHunterItem(), selector=e)
            l.add_value('prot', 'http')
            # Cell markup is stripped and whitespace trimmed
            # (Python 2: unicode.strip used as a processor).
            l.add_xpath('ip', 'td[1]', TakeFirst(), remove_tags, unicode.strip)
            l.add_xpath('port', 'td[2]', TakeFirst(), remove_tags,
                        unicode.strip)
            yield l.load_item()
Beispiel #40
0
    def parse_item(self, response):
        """Load an ImgDownloadItem (title, description, image URLs)."""
        self.logger.info("parse_item url %s" % response.url)
        loader = ItemLoader(item=ImgDownloadItem(), response=response)
        loader.add_xpath('name', '//h1[@class="c333 subTitle"]/text()')
        loader.add_xpath('desc', '//div[@class="txtmod"]/p/text()')
        loader.add_value('url', response.url)
        # Identity() keeps every matched image source.
        loader.add_xpath('image_urls', "//p[@class='tc mb10']/img/@src", Identity())
        return loader.load_item()
Beispiel #41
0
 def parse_item2(self, response):
     """Load a DmozItem from a Q&amp;A page (category, question, answer)."""
     loader = ItemLoader(item=DmozItem(), response=response)
     # Breadcrumb category first, then the question heading (both 'type').
     loader.add_xpath('type',
                      '//div[@class="location ask_main_location"]/span[@class="fl"]/a[last()]/text()')
     loader.add_xpath('type', '//div[@class="question"]/h2/text()')
     loader.add_xpath('answer', '//div[@class="anwser"]/h2/text()')
     loader.add_value('answer', '牛逼')
     yield loader.load_item()
Beispiel #42
0
    def parse(self, response):
        """Yield one Pregunta item (question + positional id) per entry."""
        preguntas = Selector(response).xpath(
            '//div[@id="question-mini-list"]/div')
        for idx, node in enumerate(preguntas):
            loader = ItemLoader(Pregunta(), node)
            loader.add_xpath('pregunta', './/h3/a/text()')
            loader.add_value('id', idx)
            yield loader.load_item()
Beispiel #43
0
 def parsePost(self, response):
     """Load a Post (url, title, date, author, content) from a blog post."""
     loader = ItemLoader(item=Post(), response=response)
     # pyquery extracts the plain text of the entry body.
     doc = pyq(response.body)
     loader.add_value('url', response.url)
     loader.add_css('title', 'h1.entry-title::text')
     loader.add_css('date', 'span.entry-date::text')
     loader.add_css('author', 'span.author.vcard > a::text')
     loader.add_value('content', doc('div.entry-content').text())
     return loader.load_item()
Beispiel #44
0
 def parse_item(self, response):
     """Scrape the hidden form fields and re-submit the download form.

     Returns a FormRequest whose response (handled by getData) carries
     the actual data payload.
     """
     parser = ItemLoader(item=HistDataItem())
     parser.add_value('url', response.url) 
     fields = ['tk', 'date', 'datemonth', 'platform', 'timeframe', 'fxpair']
     for field in fields:
         # getValue pulls the element's value by CSS id from the response.
         parser.add_value(field, getValue('#'+field, response))
     item = parser.load_item()
     # Echo the scraped fields back as POST data for the first form.
     formdata = dict(zip(fields, [item['tk'], item['date'], item['datemonth'], item['platform'], item['timeframe'], item['fxpair']]))
     request = scrapy.FormRequest.from_response(response, formnumber=0, formdata=formdata, callback=getData)
     return request
Beispiel #45
0
 def _parse_item(self, response):
     """Load a MeizituItem using the module-level XPath constants."""
     print('--------------------start item : %s' % response.url)
     loader = ItemLoader(item=MeizituItem(), response=response)
     loader.add_xpath('title', ITEM_TITLE)
     loader.add_xpath('tags', ITEM_TAGS)
     loader.add_value('url', response.url)
     loader.add_xpath('day', ITEM_DAY)
     loader.add_xpath('month_year', ITEM_MONTH_YEAR)
     loader.add_xpath('image_urls', ITEM_IMAGE_URLS)
     return loader.load_item()
Beispiel #46
0
def Loader_index(self, item_selector):
    """Load index-page fields (title, url, preview, date, images) from one list entry."""
    loader = ItemLoader(item={}, selector=item_selector)

    # The cover image doubles as both preview and download target.
    cover_sources = loader.get_xpath('.//*[@class="lz_img"]/img/@src')

    loader.add_xpath('title', './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    loader.add_xpath('url', './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    loader.add_value('preview', cover_sources)
    loader.add_css('date', '#k_list-lb-2-f::text', re=r'(\d{4}-\d{2}-\d{2})')
    loader.add_value('image_urls', cover_sources)
    return loader.load_item()
Beispiel #47
0
    def parse(self, response):
        """Yield one Lugar item per hotel entry on the listing page.

        Bug fix: the original XPath used ``@id=hotellist_inner`` (unquoted),
        which compares the attribute against a child *element* named
        ``hotellist_inner`` instead of the string literal, so the selector
        matched nothing.  The value is now a quoted string literal.
        """
        sel = Selector(response)
        lugares = sel.xpath('//div[@id="hotellist_inner"]/div')

        # Iterate over every place block found on the page.
        for i, elem in enumerate(lugares):
            item = ItemLoader(Lugar(), elem)  # elem scopes the relative XPaths
            item.add_xpath('lugar', './/h3/a/span/text()')
            item.add_value('id', i)
            yield item.load_item()
Beispiel #48
0
 def parse_product(self, response):
     """Populate a Product item from a product detail page.

     Bug fix: the image ``src`` regex used the character class ``[\"|\']``,
     which also accepts a literal ``|`` as a quote character; it is now
     ``["\']`` (real quote characters only), written as a raw string.
     """
     p = ItemLoader(item=Product(), response=response)
     p.add_css('nome', 'h1 > span[itemprop=name]::text')
     p.add_value('url', response.url)
     # Raw HTML and plain-text description both come from the same box;
     # the item's processors presumably differ per field.
     p.add_css('descricaoLongaHtml', '.infoProdBox')
     p.add_css('descricaoLonga', '.infoProdBox')
     p.add_css('image',
               'ul.a-carousel-list > li > img',
               re=r'src=["\'](?P<src>[^"\']+)["\']')
     p.add_css('categorias',
               'div[class=breadcrumb-box] span[itemprop=name]::text')
     yield p.load_item()
Beispiel #49
0
    def parse_item(self, response):
        """Scrape a chinaz.com site-profile page into a ChinazItem.

        Every field is pulled with an XPath anchored on the Chinese label
        text that precedes it on the page; ``re=`` kwargs extract numeric
        parts from image filenames / parenthesised text.

        NOTE(review): uses the Python 2 ``unicode`` builtin (``unicode.strip``)
        — this block is not Python 3 compatible as written.
        """
        loader = ItemLoader(ChinazItem(), response)
        loader.add_value('url', response.url)
        # Basic identity: display name, bare domain, and clickable homepage.
        loader.add_xpath(
            'name',
            u'//span[@id="spanwillchuanwebName"]/following-sibling::text()')
        loader.add_xpath('domain', u'//a[@id="linkUrl"]/text()')
        loader.add_xpath('homepage', u'//a[@id="linkUrl"]/@href')
        loader.add_xpath('founded',
                         u'//span[.="建站时间:"]/following-sibling::text()')
        loader.add_xpath('company',
                         u'//span[.="网站所属:"]/following-sibling::text()')
        loader.add_xpath('location',
                         u'//span[.="所属地区:"]/following-sibling::a//text()')
        loader.add_xpath('founder',
                         u'//span[.="创始人/团队:"]/following-sibling::text()')
        loader.add_xpath('categories',
                         u'//span[.="网站类型:"]/following-sibling::a//text()')
        # Rating is encoded in the star image filename, e.g. "star_4.png".
        loader.add_xpath('rating',
                         u'//td[b="用户评分:"]/following-sibling::td/img/@src',
                         re=r'star_(\d)')
        loader.add_xpath(
            'keywords',
            u'//td[starts-with(b, "关 键 词")]/following-sibling::td/a/text()')
        loader.add_xpath('brief',
                         u'//td[b="网站简介:"]/following-sibling::td/text()')
        # Third-party ranking / indexing metrics shown on the profile.
        loader.add_xpath('alexa_rank',
                         u'//span[.="Alexa排名:"]/following-sibling::text()')
        loader.add_xpath('baidu_weight',
                         u'//td[.="百度权重:"]/following-sibling::td/img/@alt')
        loader.add_xpath('google_pagerank',
                         u'//td[.="PR值:"]/following-sibling::td/img/@alt')
        loader.add_xpath('chinaz_rank', u'//td[@class="scored"]/span/text()')
        loader.add_xpath('backlink_num',
                         u'//span[.="网站反链数: "]/following-sibling::text()')
        loader.add_xpath('keyword_num', u'//a[@id="tdgjcs"]/text()')
        # Registration date sits inside parentheses after a colon.
        loader.add_xpath('domain_birth',
                         u'//span[.="域名年限:"]/following-sibling::text()',
                         re=r':([0-9-]+)\)')
        loader.add_xpath('baidu_idx_num',
                         u'//span[.="百度收录:"]/following-sibling::text()')
        loader.add_xpath('google_idx_num',
                         u'//span[.="谷歌收录:"]/following-sibling::text()')
        loader.add_xpath('sogou_idx_num',
                         u'//span[.="搜狗收录:"]/following-sibling::text()')
        # Strip markup from the free-form company introduction block.
        loader.add_xpath('introduction',
                         u'//div[h3="公司简介"]/following-sibling::div[1]',
                         MapCompose(Compose(remove_tags, unicode.strip)))
        # Screenshot src is site-relative; resolve against the site root.
        loader.add_xpath(
            'snapshot', u'//figure/img/@src',
            MapCompose(partial(urljoin, 'http://top.chinaz.com/')))
        return loader.load_item()
 def parse_page(self, response):
     """Extract a news article (title, dates, byline, body) into an AcademicNewsItem."""
     loader = ItemLoader(item=AcademicNewsItem(), response=response)
     loader.add_xpath('title', "//div[@class='title']/h1/text()")
     loader.add_xpath('time_pub', "//span[@class='datetime']/text()")
     # Crawl timestamp, formatted like the published time.
     loader.add_value('time_get',
                      format(datetime.datetime.today(), "%Y-%m-%d %H:%M:%S"))
     loader.add_xpath('author', "//div[@class='clear author']/text()")
     # publisher and source intentionally share the same anchor-text XPath.
     loader.add_xpath('publisher',
                      "//div[@class='clear author']/a[@target='_blank']/text()")
     loader.add_xpath('source',
                      "//div[@class='clear author']/a[@target='_blank']/text()")
     loader.add_xpath('classf', "//div[@id='location']/a/text()")
     loader.add_xpath('body', "//div[@id='zoom']")
     loader.add_value('url', response.url)
     return loader.load_item()
Beispiel #51
0
 def parse_page(self, response):
     """Extract a news article (title, dates, byline, body) into a NewsItem."""
     loader = ItemLoader(item=NewsItem(), response=response)
     loader.add_xpath("title", "//div[@class='title']/h1/text()")
     loader.add_xpath("time_pub", "//span[@class='datetime']/text()")
     # Crawl timestamp, formatted like the published time.
     loader.add_value("time_get",
                      format(datetime.datetime.today(), "%Y-%m-%d %H:%M:%S"))
     loader.add_xpath("author", "//div[@class='clear author']/text()")
     # publisher and source intentionally share the same anchor-text XPath.
     loader.add_xpath("publisher",
                      "//div[@class='clear author']/a[@target='_blank']/text()")
     loader.add_xpath("source",
                      "//div[@class='clear author']/a[@target='_blank']/text()")
     loader.add_xpath("classf", "//div[@id='location']/a/text()")
     loader.add_xpath("body", "//div[@id='zoom']")
     loader.add_value("url", response.url)
     return loader.load_item()
Beispiel #52
0
    def parse_detail(self, response):
        """Load a Meizitu2Item (title, image urls, page url) from a detail page."""
        print("response.url===", response.url)

        loader = ItemLoader(item=Meizitu2Item(), response=response)
        # NOTE(review): 'tilte' looks like a typo for 'title', but it must match
        # the field declared on Meizitu2Item — confirm before renaming.
        loader.add_xpath("tilte", "//h2/a/text()")
        loader.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
        loader.add_value("url", response.url)

        return loader.load_item()
    def parse(self, response):
        """Emit items for up to ``count_limit`` new albums, then stop the spider."""
        for position, album in enumerate(response.css(self.filter_css), start=1):
            # Once the limit is exceeded, abort the whole crawl.
            if position > self.count_limit:
                raise CloseSpider('done')

            loader = ItemLoader(KuwoScrapyItem(), album)
            loader.add_value('basic_source_info', '{}')
            loader.add_css('basic_source_name', self.name_css, TakeFirst())
            loader.add_css('basic_source_artist', self.artist_css, Join('&'))
            yield loader.load_item()
Beispiel #54
0
    def _record_parse(self, response):
        """Fill an AviationItem from the accident-record table.

        Each field maps to a table row number plus an optional path suffix
        that is spliced into the shared cell XPath template.
        """
        cell = '//div[@class="innertube"]/table//tr[{0}]/td[2]{1}/text()'

        loader = ItemLoader(item=AviationItem(), response=response)
        for field, row, suffix in (('date', 2, ''),
                                   ('time', 3, ''),
                                   ('operator', 5, '/a'),
                                   ('flight_number', 20, '')):
            loader.add_xpath(field, cell.format(row, suffix))
        # Fatality count was computed by an earlier callback and passed via meta.
        loader.add_value('fatalities', response.meta['fatalities'])
        for field, row in (('departure', 18),
                           ('destination', 19),
                           ('crash', 15)):
            loader.add_xpath(field, cell.format(row, '/'))
        yield loader.load_item()
Beispiel #55
0
 def parse_item(self, response):
     """Extract a cosplay post (name, info, image urls) into a CoserItem.

     Image URLs are pulled with a regex over the raw page because the plain
     XPath over the image wrapper did not match (per the original author's
     note); the '/w650' thumbnail suffix is excluded from the capture group.

     Bug fix: the '.' before 'jpg' in the regex was unescaped and matched
     any character (e.g. 'fooXjpg'); it is now escaped so only real '.jpg'
     URLs are captured.
     """
     l = ItemLoader(item=CoserItem(), response=response)
     l.add_xpath('name', "//div[@class='mb10 dib']/a/text()")
     l.add_xpath('info', "//div/p[@class='mb20']/text()")
     l.add_value('url', response.url)
     urls = l.selector.re(r'src="(.+?\.jpg)/w650')
     l.add_value('image_urls', urls)
     yield l.load_item()
Beispiel #56
0
 def parse(self, response):
     # crawl all display page
     for link in self.link_extractor['page_down'].extract_links(response):
         yield Request(url=link.url, callback=self.parse)
     print response.url
     self.browser.get(response.url)
     time.sleep(5)
     url = str(response.url)
     etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
     etaoItem_loader.add_value('url', url)
     etaoItem_loader.add_xpath('title', self._x_query['title'])
     etaoItem_loader.add_xpath('name', self._x_query['name'])
     etaoItem_loader.add_xpath('price', self._x_query['price'])
     yield etaoItem_loader.load_item()
Beispiel #57
0
    def dishparse(self, response):
        """Load one DinnerItem: id from request meta, the rest from the detail page."""
        loader = ItemLoader(item=DinnerItem(), response=response)
        loader.add_value('dish_id', response.meta['id'])
        # All fields live under the same info container.
        base = "//div[@class='large_info']"
        loader.add_xpath('name', base + "/div[@class='box']/h1/text()")
        loader.add_xpath('tags', base + "/div[contains(@class,'mgt20')]//a/text()")
        loader.add_xpath('other', base + "/ul//li/text()")
        return loader.load_item()
Beispiel #58
0
 def parse_detail(self, response):
     """Augment the item carried in meta with the image name and absolute image URLs."""
     loader = ItemLoader(response.meta['item'], response)
     loader.add_xpath('image_name', '//span[@class="list_text"]/em/b/a/text()')
     # Thumbnails are lazy-loaded; the real source sits in @data-original
     # and is site-relative, so prepend the host.
     relative_paths = response.xpath(
         '//span[@class="list_img"]/a/img/@data-original').extract()
     loader.add_value('image_urls',
                      ['http://www.nh87.cn' + path for path in relative_paths])
     return loader.load_item()