Python XPathItemLoader.add_xpath Exemples, scrapy.contrib.loader.XPathItemLoader.add_xpath Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : livingsocial_spider.py Projet : yanniey/Scrapy_livingsocial_chicago

	def parse(self, response):
		"""
		Default callback used by Scrapy to process downloaded responses.

		Yields one loaded LivingSocialDeal for every node matched by
		``self.deals_list_xpath``. Each (field, xpath) pair found in
		``self.item_fields`` is registered on that deal's loader.

		Testing contracts:
		@url http://www.livingsocial.com/cities/15-san-francisco
		@returns items 1
		@scrapes title link
		"""
		page = HtmlXPathSelector(response)

		for deal_node in page.xpath(self.deals_list_xpath):
			deal_loader = XPathItemLoader(LivingSocialDeal(), selector=deal_node)

			# Strip surrounding whitespace from every extracted unicode
			# string, then join the values of a field with a space.
			deal_loader.default_input_processor = MapCompose(unicode.strip)
			deal_loader.default_output_processor = Join()

			# iteritems() walks the (field name, xpath) pairs of the dict.
			for field_name, field_xpath in self.item_fields.iteritems():
				deal_loader.add_xpath(field_name, field_xpath)

			# Hand the finished deal over and move on to the next one.
			yield deal_loader.load_item()

# output as json file: scrapy crawl livingsocial -o items.json

Exemple #2

0

Afficher le fichier

Fichier : wggesucht_spider.py Projet : bogdimon/wg-gesucht_crawler

    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses.

        Yields one loaded WGGesuchtEntry per node matched by
        ``self.entries_list_xpath``. While the page counter carried in
        ``response.meta`` is below 59, it then yields a Request for the
        next result page.

        # Testing contracts:
        # @url http://www.livingsocial.com/cities/15-san-francisco
        # @returns items 1
        # @scrapes title link

        """
        selector = HtmlXPathSelector(response)

        # iterate over entries
        for entry in selector.xpath(self.entries_list_xpath):
            loader = XPathItemLoader(WGGesuchtEntry(), selector=entry)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        # The first response carries no counter, so it defaults to 1.
        cur_index = response.meta.get("cur_index", 1)
        # Raw string with an escaped dot: only a literal "<digits>.html"
        # is rewritten, not "<digits><any char>html".
        new_url = re.sub(r"\d+\.html", str(cur_index) + ".html", response.url)

        print("\n" + str(response.url) + "\n" + new_url + "\n")

        if cur_index < 59:
            yield Request(new_url, callback=self.parse, meta={"cur_index": cur_index + 1})

Exemple #3

0

Afficher le fichier

Fichier : spiders.py Projet : iamsuz/twitterhandle

    def parse_item(self,response):

        l = XPathItemLoader(item=TwitterBotItem(),response=response)
        print "###################"
        l.add_xpath('company','//*[@class="trends-inner"]/div/div[2]/ul/li[1]/a/text()')
        l.add_xpath('street_address','//*[@id="info-container"]/div[1]/dl/dd[1]/span[1]/text()')
        l.add_xpath('locality','//*[@id="info-container"]/div[1]/dl/dd[1]/span[2]/text()')
        l.add_xpath('region','//*[@id="info-container"]/div[1]/dl/dd[1]/span[3]/text()')
        l.add_xpath('postalcode','//*[@id="info-container"]/div[1]/dl/dd[1]/span[4]/text()')


        res = l.load_item()

        results = {'name':'','address':''}

        if 'company' in res:
            results['name'] = res['company']
        if 'street_address' in res:
            results['address'] = res['street_address']
        if 'locality' in res:
            results['address'] = results['address'] + res['locality']
        if 'region' in res:
            results['address'] = results['address'] + res['region']
        if 'postalcode' in res:
            results['address'] = results['address'] + res['postalcode']

        return res

Exemple #4

0

Afficher le fichier

Fichier : group_spider.py Projet : whitefoxx/douban_crawler

    def parse(self, response):
        """
        Yield one GroupUserItem per member entry, then follow pagination.

        The group name is the path segment that follows "group" in the
        response URL. Next-page links are taken from the
        ``span.next`` anchor; when the selector finds none, the raw body
        is searched with a regular expression instead.
        """
        page_url = response.url
        group_name = page_url[page_url.find("group") :].split("/")[1]
        hxs = HtmlXPathSelector(response)

        for dl in hxs.select('//dl[@class="obu"]'):
            l = XPathItemLoader(item=GroupUserItem(), selector=dl)
            l.add_xpath("homepage", "dt/a/@href")
            l.add_xpath("image", "dt/a/img/@src")
            l.add_xpath("name", "dd/a/text()")
            l.add_value("group", group_name)
            yield l.load_item()

        links = hxs.select('//span[@class="next"]/a/@href').extract()
        for next_url in links:
            yield Request(next_url, callback=self.parse)
        if not links:
            # Fallback for pages where the XPath finds no "next" link.
            p = re.compile('<span class="next">.*?<a href="(.+?)">', re.S)
            m = p.search(response.body_as_unicode())
            if m:
                yield Request(m.group(1), callback=self.parse)

Exemple #5

0

Afficher le fichier

Fichier : livingsocial_spider.py Projet : raghavlite/yana_bot

    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses.

        Produces a LivingSocialDeal for each node selected by
        ``self.deals_list_xpath``, filled from ``self.item_fields``.

        Testing contracts:
        @url http://www.livingsocial.com/cities/15-san-francisco
        @returns items 1
        @scrapes title link
        """
        root = HtmlXPathSelector(response)
        deal_nodes = root.xpath(self.deals_list_xpath)

        for node in deal_nodes:
            item_loader = XPathItemLoader(LivingSocialDeal(), selector=node)

            # Processors applied to every field of this loader.
            item_loader.default_input_processor = MapCompose(unicode.strip)
            item_loader.default_output_processor = Join()

            # Each pair is (field name, xpath expression).
            for pair in self.item_fields.iteritems():
                item_loader.add_xpath(*pair)

            yield item_loader.load_item()

Exemple #6

0

Afficher le fichier

Fichier : NrcMaterialsScraper.py Projet : SkyTruth/scraper

    def parse_materials(self, response):
        """
        Yield one NrcScrapedMaterial per data row of the materials table.

        The report number comes from the request meta. The first table
        row is dropped as the header. Once all rows have been yielded,
        the bot task for this report is marked 'DONE' through ``self.db``.
        """
        reportnum = response.request.meta['reportnum']
        body_text = unicode(response.body, response.encoding)
        page = HtmlXPathSelector(text=body_text)
        rows = page.select('//table[@class="t16Standard"]/tr')

        if not rows:
            self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        else:
            # Skip the first report record because this is the header row
            rows.pop(0)
            if rows:
                self.log('Retrieved {0} materials records in report {1}'
                         .format(len(rows), reportnum), log.INFO)
            else:
                self.log('No materials reports found in response {0}'
                         .format(reportnum), log.INFO)

        def truncate_names(slist):
            # Input processor for the 'name' field: keep 32 characters.
            return [s[:32] for s in slist]

        for row in rows:
            row_loader = XPathItemLoader(NrcScrapedMaterial(), row)
            row_loader.name_in = truncate_names
            row_loader.add_value('reportnum', reportnum)
            # Only fields that declare an 'xpath' are read from the row.
            for field_name, field_params in NrcScrapedMaterial.fields.items():
                if 'xpath' in field_params:
                    row_loader.add_xpath(field_name, field_params['xpath'])
            yield row_loader.load_item()

        self.db.setBotTaskStatus(reportnum, self.name, 'DONE')

Exemple #7

0

Afficher le fichier

Fichier : spidersVault.py Projet : bsaideepak/crawlers

    def parse_item(self,response):
        """
        Load a BurrpItem from the listing-details section and return it.

        Company, phone, address, region and two cuisine entries are each
        read from a fixed XPath. The loaded item is returned as is; no
        separate summary structure is built from it.
        """
        l = XPathItemLoader(item = BurrpItem(),response = response)

        l.add_xpath('company','//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/span/p/text()')
        l.add_xpath('phone','//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[1]/strong/text()')
        l.add_xpath('address','//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[2]/text()')
        l.add_xpath('region','//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/p/a/text()')
        l.add_xpath('cuisine1','//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[3]/a[1]/text()')
        l.add_xpath('cuisine2','//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[3]/a[2]/text()')

        return l.load_item()

Exemple #8

0

Afficher le fichier

Fichier : spidersVault.py Projet : bsaideepak/crawlers

    def parse_item(self,response):
        """
        Load a LocalItem from the business vCard and return it.

        Company, phone, locality, region and postal code are each read
        from a fixed XPath. The loaded item is returned as is; no
        separate summary structure is built from it.
        """
        l = XPathItemLoader(item = LocalItem(),response = response)

        l.add_xpath('company','//*[@id="biz-vcard"]/div[2]/h1/span/text()')
        l.add_xpath('phone','//*[@id="biz-vcard"]/div[5]/div[2]/address/p/strong/text()')
        l.add_xpath('locality','//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[2]/text()')
        l.add_xpath('region','//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[3]/text()')
        l.add_xpath('postalcode','//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[4]/text()')

        return l.load_item()

Exemple #9

0

Afficher le fichier

Fichier : ParseSpider.py Projet : talentsun/gmatclub

	def parse_rc(self,response):
		"""
		Load a ParseRcItem, render it to an HTML file and return the item.

		The question id is derived from the response URL. The article
		text is read from the file in ``self.fileList`` whose name
		contains that id (the last match wins). The rendered page is
		written to ``/home/huwei/gmatclub/rc/<id>.html`` using the
		``self.rc_content`` template.
		"""
		loader = XPathItemLoader(item=ParseRcItem(), response=response)
		question_id = self.parse_id_from_url(response.url)
		loader.add_value('questionId', question_id)
		loader.add_xpath('text', '//div[@class="text"]/text()')
		loader.add_xpath('text', '//div[@class="text"]/span/text()')
		loader.add_xpath('answerList','//div[@class="item clearfix"]/span/text()')
		loader.add_xpath('choiceList','//div[@class="item clearfix"]/b/text()')
		loader.add_xpath('answer','//div[@class="answer clearfix hidden QuesHidden"]/b/text()')
		item = loader.load_item()

		# With three text parts, the third one is underlined and placed
		# between the first and the second.
		if len(item['text']) == 3:
			test = item['text'][0] + '<span style="text-decoration:underline;">' + item['text'][2]  + '</span>'+ item['text'][1]
		else:
			test = item['text'][0]

		# NOTE(review): if no file name contains the id, `article` stays
		# unbound and the format call below raises, as it did before.
		for filename in self.fileList:
			if filename.find(question_id) != -1:
				# `with` guarantees the handle is closed after reading.
				with open('/home/huwei/origin/rcarticle/' + filename) as f:
					article = f.read()

		qid = item['questionId'][0]
		# Template arguments: article body, question header, five
		# (choice, choice, answer) groups, then the correct answer.
		template_args = [article[24:len(article) - 4], qid, qid, test]
		for n in range(5):
			template_args.extend([qid, item['choiceList'][n], item['choiceList'][n], item['answerList'][n]])
		template_args.extend([qid, item['answer'][0]])
		content = self.rc_content.format(*template_args)

		with open('/home/huwei/gmatclub/rc/' + question_id + '.html','w') as wf:
			wf.write(content)
		return item

Exemple #10

0

Afficher le fichier

Fichier : jabong_spider.py Projet : DeepFashion/E-Commerce-Scrapy

    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        """

        selector = HtmlXPathSelector(response)


        details=urlparse(response.request.url)
        queryStr={x.split('=')[0]:(x.split('=')[1]) for x in details.query.split("&")}
        print "\n",queryStr['page']

        # iterate over deals
        for deal in selector.select(self.products_list_xpath):
            loader = XPathItemLoader(JabongData(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            # adding the request URL to the loader 
            loader.add_value("requestURL",unicode(response.request.url, "utf-8"))


            # adding the category for the request
            loader.add_value("category",unicode(self.category))

            yield loader.load_item()

Exemple #11

0

Afficher le fichier

Fichier : webarticle.py Projet : fredatshift/collective-intelligence

    def parse_article(self, response):
        """
        Yield a single LeMondeArt item loaded from the response.

        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        article_loader = XPathItemLoader(LeMondeArt(), selector=Selector(response))

        self.log('\n\nA response from %s just arrived!' % response.url)

        # The strip processor is passed explicitly with every xpath; only
        # the output processor is set as the loader default.
        strip_text = MapCompose(unicode.strip)
        article_loader.default_output_processor = Join()

        # Populate the item field by field; a ValueError raised for one
        # xpath is logged and the remaining fields are still processed.
        for field_name, field_xpath in self.article_item_fields.iteritems():
            try:
                article_loader.add_xpath(field_name, field_xpath, strip_text)
            except ValueError:
                self.log("XPath %s not found at url %s" % (field_xpath, response.url))

        yield article_loader.load_item()

Exemple #12

0

Afficher le fichier

Fichier : FracFocusScraper.py Projet : SkyTruth/scraper

    def scrape_content_items(self, response):
        """
        Yield the FracFocusScrape items of one result page that are new.

        Every table row is loaded into an item. Rows without an 'api'
        value are ignored. Rows already known to ``self.db`` only bump
        the '_existing_count' stat; the others bump '_new_count' and are
        yielded. A warning is logged when both counters are still empty.
        """
        hxs = HtmlXPathSelector(response)
        stats = self.crawler.stats

        current_page = hxs.select('//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value').extract()
        if not current_page:
            self.log('%s No page number found' % (response.meta['cookiejar']), log.WARNING)
        else:
            self.log('%s Scraping page %s' % (response.meta['cookiejar'], current_page[0]), log.INFO)

        stats.inc_value('_pages', spider=self)

        def first_twenty(slist):
            # Input processor: keep at most 20 characters per value.
            return [s[:20] for s in slist]

        for row in hxs.select('//table[@id="MainContent_DocumentList1_GridView1"]//tr'):
            row_loader = XPathItemLoader(FracFocusScrape(), row)
            row_loader.state_in = first_twenty
            row_loader.county_in = first_twenty
            for field_name, field_params in FracFocusScrape.fields.items():
                row_loader.add_xpath(field_name, field_params['xpath'])
            item = row_loader.load_item()

            if not item.get('api'):
                continue
            if self.db.itemExists(item):
                stats.inc_value('_existing_count', spider=self)
                continue
            stats.inc_value('_new_count', spider=self)
            yield item

        if not (stats.get_value('_existing_count') or stats.get_value('_new_count')):
            self.log('%s No records found' % (response.meta['cookiejar']), log.WARNING)

Exemple #13

0

Afficher le fichier

Fichier : livingsocial_spider.py Projet : catomania/new-coder

	def parse(self, response):
		"""
		Default callback used by Scrapy to process downloaded responses.

		A page holds several deals; one LivingSocialDeal is yielded for
		each node matched by ``self.deals_list_xpath``.

		Testing contracts:
		@url http://www.livingsocial.com/cities/15-san-francisco
		@returns items 1
		@scrapes title link

		"""
		html = HtmlXPathSelector(response)

		for deal_sel in html.xpath(self.deals_list_xpath):
			# One loader per deal, bound to that deal's sub-tree.
			deal_loader = XPathItemLoader(LivingSocialDeal(), selector=deal_sel)

			# An Item Loader has one input and one output processor per
			# field: strip whitespace on the way in, join with a space on
			# the way out.
			deal_loader.default_input_processor = MapCompose(unicode.strip)
			deal_loader.default_output_processor = Join()

			# Register the xpath of every configured field on the loader.
			for name, path in self.item_fields.iteritems():
				deal_loader.add_xpath(name, path)

			# load_item() runs the processors over the collected values;
			# the finished item is yielded before the next deal starts.
			yield deal_loader.load_item()

Exemple #14

0

Afficher le fichier

Fichier : rssurls_discovery_spider.py Projet : solaise73/adaptfm

	def parse(self, response):
		"""
		Handle one RSS response.

		When the feed contains ``atom:entry`` elements (an iTunes feed),
		one lookup Request per entry id is yielded. Otherwise the feed
		is treated as a single podcast feed and one PodcastItem is
		yielded. ``self.i`` counts the feeds processed so far.
		"""
		x = XmlXPathSelector(response)
		x.register_namespace("im", "http://itunes.apple.com/rss")
		x.register_namespace('atom','http://www.w3.org/2005/Atom')
		feedCount = str(len(self.start_urls))
		self.i=self.i+1
		self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)
		entries = x.select('//atom:entry')

		if entries:
			# an itunes rss feed
			for entry in entries:
				entry_ids = entry.select('./atom:id/@im:id').extract()
				self.log('Entry %s' % (str(entry_ids)), level=log.INFO)
				if not entry_ids:
					# An entry without im:id must not abort the
					# remaining entries with an IndexError.
					continue
				yield Request('http://itunes.apple.com/lookup?id='+ entry_ids[0], callback=self.getItunesTrackJson)
		else:
			# a single feed
			l = XPathItemLoader(PodcastItem(), x)
			l.add_value('id', 'rssdisco_'+response.url)
			l.add_value('audioType', 'disco')
			l.add_value('brandFeed', response.url)
			l.add_xpath('brandName', '//./channel/title/text()')
			self.log('Feed from rss %s' % (response.url), level=log.INFO)

			yield l.load_item()

Exemple #15

0

Afficher le fichier

Fichier : yahoo_answer.py Projet : KeithYue/QA-spider

    def get_answer(self, selector, question_loader):
        """
        Build a YahooAnswer item from one answer node.

        ``selector`` is the node of the answer; ``question_loader`` is
        the loader of the owning question, from which the question id
        is copied. Returns the loaded item.
        """
        answer_loader = XPathItemLoader(item = YahooAnswer(), selector = selector)
        answer_loader.add_xpath('answer_id', './@id')
        answer_loader.add_xpath('answer_content','.//div[@class="qa-container"]//div[@class="content"]//text()')
        answer_loader.add_value('answerer',self.get_user(selector))
        answer_loader.add_value('question_id',question_loader.get_output_value('question_id'))
        answer_loader.add_xpath('answering_date',''.join([
            './/div[@class="qa-container"]//ul[@class="meta"]',
            '/li[1]/abbr/@title'
            ]))
        # 'marks' is collected from two places: the text of the rating
        # span and the text of its <strong> child.
        answer_loader.add_xpath('marks',''.join([
            './/div[@class="utils-container"]',
            '//li[@class="rate-up"]',
            '//span[@class="seo-rated"]/text()'
            ]))
        answer_loader.add_xpath('marks',''.join([
            './/div[@class="utils-container"]',
            '//li[@class="rate-up"]',
            '//span[@class="seo-rated"]//strong/text()'
            ]))

        # Get the number of good marks.
        marks = answer_loader.get_output_value('marks')
        # str.find() returns -1 when absent and 0 for a match at the
        # start, so its result must not be used as a truth value.
        if marks.find('good') != -1:
            answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])

        # Flag the best answer from the node's class attribute.
        answer_class = selector.select('./@class').extract()[0]
        if answer_class.find('best') != -1:
            answer_loader.add_value('is_best_answer', 1)
        else:
            answer_loader.add_value('is_best_answer', 0)

        return answer_loader.load_item()

Exemple #16

0

Afficher le fichier

Fichier : rafap.py Projet : TotallyBullshit/finance-2

    def parse(self, response):
        """Return a one-element list holding the "Uruguay Bond Index" item."""
        index_loader = XPathItemLoader(item=FinanceIndex(), response=response)

        # Fixed metadata first, then the value scraped from the page.
        for field_name, constant in (("name", "Uruguay Bond Index"), ("unit", "bps")):
            index_loader.add_value(field_name, constant)
        index_loader.add_xpath("value", "//span/text()")

        loaded = index_loader.load_item()
        return [loaded]

Exemple #17

0

Afficher le fichier

Fichier : kitco.py Projet : TotallyBullshit/finance-2

    def parse(self, response):
        """Return a one-element list holding the "Oro Spot Cierre Londres" item."""
        spot_loader = XPathItemLoader(item=FinanceIndex(), response=response)

        # Fixed metadata first, then the value scraped from the page.
        for field_name, constant in (("name", "Oro Spot Cierre Londres"), ("unit", "USD")):
            spot_loader.add_value(field_name, constant)
        spot_loader.add_xpath("value", "//td[@bgcolor='#cccc99'][1]//text()")

        loaded = spot_loader.load_item()
        return [loaded]

Exemple #18

0

Afficher le fichier

Fichier : tcad.py Projet : kerinin/giscrape

 def history(text, url):
     """
     Load a TCADValueHistoryItem from an HTML fragment.

     ``text`` has its non-breaking spaces (u'\\xa0') removed and is
     wrapped in a TextResponse for ``url``. The year is read from the
     first table cell and the value from the fourth.
     """
     cleaned_body = str(text.replace(u'\xa0',''))
     fragment = http.TextResponse(url=url, body=cleaned_body)

     history_loader = XPathItemLoader(item=TCADValueHistoryItem(), response=fragment)
     for field_name, cell_xpath in (('year', '//td[1]/text()'),
                                    ('value', '//td[4]/text()')):
         history_loader.add_xpath(field_name, cell_xpath)

     return history_loader.load_item()

Exemple #19

0

Afficher le fichier

Fichier : horrible_spyder.py Projet : therin/edx

 def parse(self, response):
     """
     Yield one TorrentItem per trusted row of the listing table.

     For each name cell, the link target is stored as 'torrent' and
     the text of the link whose href contains "nyaa" as 'title'.
     """
     hxs = HtmlXPathSelector(response)
     entries = hxs.select('//tr[contains(@class,"trusted tlistrow")]/td[contains(@class, "tlistname")]')
     for entry in entries:
         # Indented with spaces only; the loop body used to mix a tab
         # with spaces.
         l = XPathItemLoader(item=TorrentItem(), selector=entry)
         l.add_xpath('torrent', 'a/@href')
         l.add_xpath('title', 'a[contains(@href, "nyaa")]/text()')
         yield l.load_item()

Exemple #20

0

Afficher le fichier

Fichier : iWatchOnline_spider.py Projet : JoeOBrien/tvlinksscrapy

	def parse(self, response):
		"""
		Yield one iWatchOnline item per node matched by
		``self.links_list_xpath``, filled from ``self.episodes_field``.
		"""
		page = HtmlXPathSelector(response)
		link_nodes = page.select(self.links_list_xpath)

		for node in link_nodes:
			link_loader = XPathItemLoader(iWatchOnline(), selector=node)

			# Strip whitespace on input, join values on output.
			link_loader.default_input_processor = MapCompose(unicode.strip)
			link_loader.default_output_processor = Join()

			for pair in self.episodes_field.iteritems():
				link_loader.add_xpath(*pair)

			yield link_loader.load_item()

Exemple #21

0

Afficher le fichier

Fichier : pandora.py Projet : rimbi/bookcrawler

    def parse_item(self, response):
        """
        Load a BookItem from a book page and return it.

        Name, ISBN, author and publisher are read from fixed XPaths.
        The price is the part of the price text that precedes " TL".
        The page URL is stored as 'link' and the store id is 4.
        """
        # Body indented with spaces to match the def line (it used tabs).
        l = XPathItemLoader(item=BookItem(), response=response)
        l.add_xpath('name',     '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelAdi\']/text()')
        l.add_xpath('isbn',     '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelIsbn\']/text()')
        l.add_xpath('author',   '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelYazar\']/a/text()')
        l.add_xpath('publisher','//a[@id=\'ctl00_ContentPlaceHolderMainOrta_HyperLinkYayinci\']/text()')
        l.add_xpath('price',    '//span[@class=\'fiyat\']/text()', u'(.*) TL')
        l.add_value('link', response.url)
        l.add_value('store', 4)
        return l.load_item()

Exemple #22

0

Afficher le fichier

Fichier : spidersVault.py Projet : bsaideepak/crawlers

    def parse_item(self,response):
        """
        Load a HotfrogItem whose 'company' comes from the text of the
        second table under the page's <center> element, print two empty
        lines and return the item.
        """
        l = XPathItemLoader(item = HotfrogItem(),response = response)
        l.add_xpath('company','/html/body/center/table[2]/text()')
        res =  l.load_item()
        print("")
        print("")
        return res

Exemple #23

0

Afficher le fichier

Fichier : fxstreet.py Projet : TotallyBullshit/finance-2

 def parse(self, response):
     """
     Return a list with one FinanceIndex item per entry of ``rates``.

     Each entry is (name, pattern, pos): the value is the text of the
     ``pos``-th cell following the cell whose link text contains
     ``pattern``. The unit is always "%".
     """
     value_xpath = "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()"

     def load_rate(name, pattern, pos):
         # One loader per rate, all reading from the same response.
         rate_loader = XPathItemLoader(item=FinanceIndex(), response=response)
         rate_loader.add_value("name", name)
         rate_loader.add_value("unit", "%")
         rate_loader.add_xpath("value", value_xpath % (pattern, pos))
         return rate_loader.load_item()

     return [load_rate(name, pattern, pos) for name, pattern, pos in rates]

Exemple #24

0

Afficher le fichier

Fichier : zhizhu_user_topic_spider.py Projet : KeithYue/Zhihu_Spider

    def get_UT_item(self, sel, user_url):
        '''
        Build the user-topic relationship item for one topic node.

        ``sel`` is the selector of the topic and ``user_url`` the URL the
        topic was crawled from. The stored 'user_url' is made of the
        third- and second-to-last segments of ``user_url``, prefixed
        with "/".
        '''
        url_parts = user_url.split('/')
        short_user_url = '/' + '/'.join(url_parts[-3:-1])

        relation = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
        relation.add_value('crawled_from', user_url)
        relation.add_value('user_url', short_user_url)
        relation.add_xpath('topic_url', './/a[contains(@class, "zm-list-avatar-link")]/@href')

        return relation.load_item()

Exemple #25

0

Afficher le fichier

Fichier : netkitap.py Projet : sardok/bookcrawler

    def parse_item(self, response):
        """
        Load a BookItem from a book page and return it.

        Name, author and publisher are read from fixed XPaths. The ISBN
        and the price are extracted from their text with a regular
        expression. The page URL is stored as 'link' and the store id
        is 5.
        """
        # Body indented with spaces to match the def line (it used tabs).
        l = XPathItemLoader(item=BookItem(), response=response)
        l.add_xpath('name',     '//h1[@class=\'kitapad14pnt\']/b/text()')
        l.add_xpath('isbn',     '//span[@class=\'kunye\']/text()', u'ISBN: ([0-9\-X]+)')
        l.add_xpath('author',   '//span[@class=\'yazarad12pnt\']/a/span[@class=\'yazarad12pnt\']/text()')
        l.add_xpath('publisher','//h3[@class=\'kapakyazisi\']/b/font/a/text()')
        l.add_xpath('price',    '//span[@class="kapakyazisi"]/font/b/text()', u'(.*) TL')
        l.add_value('link', response.url)
        l.add_value('store', 5)
        return l.load_item()

Exemple #26

0

Afficher le fichier

Fichier : ideefixe.py Projet : rimbi/bookcrawler

    def parse_item(self, response):
        """
        Load a BookItem from a book page and return it.

        Name, author and publisher are read from fixed XPaths. The ISBN
        and the price are extracted from their text with a regular
        expression. The page URL is stored as 'link' and the store id
        is 2.
        """
        # Body indented with spaces to match the def line (it used tabs).
        l = XPathItemLoader(item=BookItem(), response=response)
        l.add_xpath('name',     '//div[@class=\'boxTanimisim\']/div/text()')
        l.add_xpath('isbn',     '//div[@id=\'tanitimbox\']/text()', u'.*ISBN : ([0-9]+)')
        l.add_xpath('author',   '//div[@class=\'boxTanimVideo\']/a/text()')
        l.add_xpath('publisher','//h3[@class=\'boxTanimyayinevi\']/a/b/text()')
        l.add_xpath('price',    '//b[@class=\'pricerange\']/text()', u'\s*([0-9,]*) TL \(KDV Dahil\)')
        l.add_value('link', response.url)
        l.add_value('store', 2)
        return l.load_item()

Exemple #27

0

Afficher le fichier

Fichier : kitapyurdu.py Projet : rimbi/bookcrawler

    def parse_item(self, response):
        """
        Load a BookItem from a book page and return it.

        Name, author and publisher are read from fixed XPaths. The ISBN
        and the price are extracted from their text with a regular
        expression. The page URL is stored as 'link' and the store id
        is 3.
        """
        # Body indented with spaces to match the def line (it used tabs).
        l = XPathItemLoader(item=BookItem(), response=response)
        l.add_xpath('name',     '//span[@class=\'kitapismi\']/text()')
        l.add_xpath('isbn',     '//span[@class=\'normalkucuk\']/text()', u'ISBN:([0-9]+)')
        l.add_xpath('author',   '//span/a[contains(@href, "/yazar/")]/text()')
        l.add_xpath('publisher','//span/a[contains(@href, "/yayinevi/")]/text()')
        l.add_xpath('price',    '//td/text()', u'Kitapyurdu Fiyatı:(.*) TL\.')
        l.add_value('link', response.url)
        l.add_value('store', 3)
        return l.load_item()

Exemple #28

0

Afficher le fichier

Fichier : lazytweet.py Projet : KeithYue/QA-spider

 def get_user(self, selector):
     """
     Build a LazyTweetUser item from a node.

     The username is the text of the node's first link; the Twitter
     URL is that username appended to "http://twitter.com/".
     """
     twitter_user = XPathItemLoader(item = LazyTweetUser(), selector = selector)
     twitter_user.add_xpath('twitter_username', './a[1]/text()')

     # The URL is derived from the username value as processed so far.
     username = twitter_user.get_output_value('twitter_username')
     twitter_user.add_value('twitter_url', r'http://twitter.com/' + username)

     return twitter_user.load_item()

Exemple #29

0

Afficher le fichier

Fichier : kaogmat.py Projet : talentsun/gmatclub

 def parse_argument(self, response):
     """
     Load an Argument item with its id, rating and essay markup.

     The id is parsed from the response URL; -1 is stored when that
     yields a falsy result.
     """
     argument = XPathItemLoader(item=Argument(), response=response)

     parsed_id = self.parse_id_from_url(response.url)
     argument.add_value('id', parsed_id if parsed_id else -1)

     for field_name, field_xpath in (('rating', '//b[@id="QuestionRateValue"]/text()'),
                                     ('essay', '//div[@class="essay"]')):
         argument.add_xpath(field_name, field_xpath)

     return argument.load_item()

Exemple #30

0

Afficher le fichier

Fichier : bcu.py Projet : TotallyBullshit/finance-2

    def parse(self, response):
        """Return a one-element list holding the "Tasa Objetivo BCU" item."""
        target_rate = XPathItemLoader(item=FinanceIndex(), response=response)

        for field_name, constant in (("name", "Tasa Objetivo BCU"), ("unit", "%")):
            target_rate.add_value(field_name, constant)

        # NOTE(review): "8.75" is passed as the XPath expression itself,
        # presumably to hard-code the rate -- confirm this is intended.
        target_rate.add_xpath("value", "8.75")

        loaded = target_rate.load_item()
        return [loaded]

Exemple #31

0

Afficher le fichier

Fichier : zhizhu_user_topic_spider.py Projet : yyhTHU/Zhihu_Spider

    def get_UT_item(self, sel, user_url):
        '''
        Create the user-topic relationship item for a topic selector.

        ``user_url`` is recorded unchanged as 'crawled_from'. Its
        third- and second-to-last path segments, prefixed with "/",
        are recorded as 'user_url'.
        '''
        segments = user_url.split('/')[-3:-1]
        relative_user_url = '/' + '/'.join(segments)
        topic_link_xpath = './/a[contains(@class, "zm-list-avatar-link")]/@href'

        loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
        loader.add_value('crawled_from', user_url)
        loader.add_value('user_url', relative_user_url)
        loader.add_xpath('topic_url', topic_link_xpath)
        return loader.load_item()

Exemple #32

0

Afficher le fichier

Fichier : yahoo_answer.py Projet : KeithYue/QA-spider

    def get_user(self, selector):
        """
        Build a YahooUser item from a node, or return None.

        The user id is the ``show`` parameter of the profile URL and is
        only added when that URL is present and has the expected form.
        None is returned when no user name was collected.
        """
        user_loader = XPathItemLoader(item = YahooUser(), selector = selector)
        user_loader.add_xpath('user_name', './/span[contains(@class, "user")]//span[contains(@class, "fn")]/text()')
        user_loader.add_xpath('user_url', './/span[@class="user"]//a[@class="url"]/@href')

        # Guard both steps: a missing URL or one that does not match
        # must not raise before the "no user" case below is reached.
        user_url = user_loader.get_output_value('user_url')
        profile = None
        if user_url:
            profile = re.match(r'http://answers\.yahoo\.com/my/profile\?show=(.*)', user_url)
        if profile:
            user_loader.add_value('user_id', profile.group(1))

        if user_loader.get_collected_values('user_name'):
            return user_loader.load_item()
        else:
            return None

Exemple #33

0

Afficher le fichier

    def parse(self, response):
        """
        Return a list of FinanceIndex items, one per entry of ``rates``.

        For an entry (name, pattern, pos), the value is read from the
        ``pos``-th cell after the cell whose link text contains
        ``pattern``; the unit is "%".
        """
        xpath_template = (
            "//a[contains(text(), '%s')]/parent::td"
            "/following-sibling::td[%d]/text()")

        collected = []
        for entry in rates:
            label, pattern, pos = entry
            loader = XPathItemLoader(item=FinanceIndex(), response=response)
            loader.add_value("name", label)
            loader.add_value("unit", "%")
            loader.add_xpath("value", xpath_template % (pattern, pos))
            collected.append(loader.load_item())
        return collected

Exemple #34

0

Afficher le fichier

Fichier : techInAsiaScraper.py Projet : blaklaybul/AsiaScraper

    def parse(self, response):
        """
        Yield one SearchResults item per node matched by
        ``self.startup_results_xpath``, filled from ``self.item_fields``.
        """
        page = HtmlXPathSelector(response)
        startup_nodes = page.select(self.startup_results_xpath)

        for node in startup_nodes:
            result_loader = XPathItemLoader(SearchResults(), selector=node)

            # Strip whitespace on input, join values on output.
            result_loader.default_input_processor = MapCompose(unicode.strip)
            result_loader.default_output_processor = Join()

            for field_name, field_xpath in self.item_fields.iteritems():
                result_loader.add_xpath(field_name, field_xpath)

            yield result_loader.load_item()

Exemple #35

0

Afficher le fichier

Fichier : livesocial_spider.py Projet : andrewFisherUa/i-love-tutorials

    def parse(self, response):
        """
        Yield one LivingSocialDeal per node matched by
        ``self.deals_list_xpath``, filled from ``self.item_fields``.
        """
        document = HtmlXPathSelector(response)

        # Look for the deals on the page.
        for deal_sel in document.select(self.deals_list_xpath):
            deal = XPathItemLoader(LivingSocialDeal(), selector=deal_sel)

            # Strip whitespace on input, join values on output.
            deal.default_input_processor = MapCompose(unicode.strip)
            deal.default_output_processor = Join()

            for pair in self.item_fields.iteritems():
                deal.add_xpath(*pair)

            yield deal.load_item()

Exemple #36

0

Afficher le fichier

Fichier : auctionzip_spider.py Projet : elcpls/auction-scraper

    def parse_links(self, response):
        """
        Load an AuctionsItem from a listing page and return it.

        The id is the first ``lid=<digits>`` value found in the response
        URL. The XPath of every scraped field is read from ``settings``.
        """
        listing_ids = re.findall(r"lid=(\d+)", response.url)

        auction = XPathItemLoader(item=AuctionsItem(), response=response)
        auction.add_value("id", listing_ids[0])

        # (item field, settings key holding its xpath)
        leading_fields = (
            ("auctioneer", 'AUCTION_AUCTIONEER'),
            ("contact_number", 'AUCTION_CONTACT_NUMBER'),
            ("date", 'AUCTION_DATE'),
            ("time", 'AUCTION_TIME'),
            ("location", 'AUCTION_LOCATION'),
        )
        for field_name, settings_key in leading_fields:
            auction.add_xpath(field_name, settings[settings_key])

        auction.add_value("link", response.url)
        auction.add_xpath("listing", settings['AUCTION_LISTING'])

        return auction.load_item()

Exemple #37

0

Afficher le fichier

 def parse(self, response):
     """
     Yield one CompraLineaItem per row of the purchase-order table.

     The order number and the year are taken from the ``wOCabc`` and
     ``wEjercicio`` parameters of the URL query and stored on every
     line as integers. The first three cells of a row provide
     'cantidad', 'importe' and 'detalle'.
     """
     hxs = HtmlXPathSelector(response)

     query = urlparse(response.url).query
     url_params = re.search(r'wOCabc=(\d+)&wEjercicio=(\d+)', query)
     orden_compra, anio = url_params.groups()
     orden_number = int(orden_compra)
     year_number = int(anio)

     cell_fields = (('cantidad', 'td[1]/text()'),
                    ('importe', 'td[2]/text()'),
                    ('detalle', 'td[3]/text()'))

     for row in hxs.select('//table[contains(@width, "760")][2]/tr'):
         line_loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
         for field_name, cell_xpath in cell_fields:
             line_loader.add_xpath(field_name, cell_xpath)
         line_loader.add_value('orden_compra', orden_number)
         line_loader.add_value('anio', year_number)
         yield line_loader.load_item()

Exemple #38

0

Afficher le fichier

 def get_question(self, selector, response):
     """Load a LazyTweetQuestion from ``selector``.

     The question id is the last ``/``-separated segment of
     ``response.url``; the asker is whatever ``self.get_user`` returns for
     the post-meta span found under ``selector``.
     """
     # XPaths meant to be evaluated relative to ``selector`` start with a dot.
     loader = XPathItemLoader(item=LazyTweetQuestion(), selector=selector)

     loader.add_xpath(
         'question_content',
         './/span[@class="post-body"]'
         '//span[@class="post-status"]/descendant-or-self::text()')
     # not useful
     loader.add_xpath('question_tags', '//*[@id="post-tags"]/ul/li/a/text()')
     loader.add_xpath(
         'asking_date',
         './/span[@class="post-meta"]//span[@class="timestamp"]/text()')

     post_meta = selector.select('.//span[@class="post-meta"]')
     loader.add_value('asker', self.get_user(post_meta))

     loader.add_xpath(
         'number_of_answers',
         './/span[@class="post-meta"]'
         '//a[last()]/text()')
     loader.add_value('question_id', response.url.split('/')[-1])

     # Debug output of the tags collected so far.
     print(loader.get_output_value('question_tags'))
     return loader.load_item()

Exemple #39

0

Afficher le fichier

Fichier : spiders.py Projet : RuralIndia/pari_scrapy

    def parse_articles(self, response):
        """Build an Article item from an article detail page.

        Title, content, date and keywords are extracted by XPath from the
        response; ``link`` is the response URL and ``author`` is the constant
        ``'Sainath'``.

        Returns the loaded Article item.
        """
        # The loader builds its own selector from ``response``; the separate
        # HtmlXPathSelector the original created here was never used.
        loader = XPathItemLoader(item=Article(), response=response)
        loader.add_xpath("title", "//h1[contains(@class,'detail-title')]/text()")
        loader.add_xpath(
            "content",
            "//div[contains(@class,'article-text')]//p[contains(@class,'body')]"
        )
        loader.add_xpath("date", "//span[contains(@class,'dateline')]/text()")
        # NOTE(review): " " looks like a placeholder rather than a real XPath
        # expression; kept unchanged -- confirm what "location" should select.
        loader.add_xpath("location", " ")
        loader.add_xpath("keywords", "//div[@id='articleKeywords']/p/a/text()")
        loader.add_value("link", response.url)
        loader.add_value("author", 'Sainath')
        return loader.load_item()

Exemple #40

0

Afficher le fichier

    def parse(self, response):
        """Get response from start_urls and yield a LivingSocial item per deal.

        Deals are the nodes matched by ``self.xpath_for_deals``; each field in
        ``self.item_fields`` is loaded from its (whitespace-stripped) XPath.
        """
        page = HtmlXPathSelector(response)

        for deal_node in page.xpath(self.xpath_for_deals):
            deal_loader = XPathItemLoader(LivingSocial(), selector=deal_node)

            # Processors: strip each extracted string, then join the values.
            deal_loader.default_input_processor = MapCompose(unicode.strip)
            deal_loader.default_output_processor = Join()

            # The XPath expressions themselves are stripped before use.
            for field_name, raw_xpath in self.item_fields.iteritems():
                deal_loader.add_xpath(field_name, raw_xpath.strip())
            yield deal_loader.load_item()

Exemple #41

0

Afficher le fichier

    def parse_full_report(self, response):
        """Yield the full-report item for ``response`` and mark its task DONE.

        The report number comes from the ``standard_web inc_seq`` query
        parameter of the response URL; the report body is the ``//body`` node.
        """
        # Work around a weird bug where lxml can't handle
        # encode=WINDOWS-1252: decode the body, re-encode it as utf-8 and wrap
        # it in a new TextResponse, since XPathItemLoader requires a Response.
        decoded_body = unicode(response.body, response.encoding)
        utf8_response = TextResponse(
            url=response.url,
            body=decoded_body.encode('utf-8'),
            encoding='utf-8')

        loader = XPathItemLoader(NrcScrapedFullReport(), response=utf8_response)
        query_params = parse_qs(urlsplit(response.url).query)
        loader.add_value('reportnum', query_params['standard_web inc_seq'])
        loader.add_xpath('full_report_body', '//body')
        loader.add_value('full_report_url', response.url)

        report_item = loader.load_item()
        # Capture the report number before the item is handed downstream.
        reportnum = report_item['reportnum']
        yield report_item
        self.db.setBotTaskStatus(reportnum, self.name, 'DONE')

Exemple #42

0

Afficher le fichier

 def get_answer(self, selector, response):
     """Load a LazyTweetAnswer from ``selector``.

     The question id is the last ``/``-separated segment of
     ``response.url``; the answerer is whatever ``self.get_user`` returns
     for the answer-meta span found under ``selector``.

     Returns the loaded LazyTweetAnswer item.
     """
     answer_loader = XPathItemLoader(item=LazyTweetAnswer(),
                                     selector=selector)
     answer_loader.add_value('question_id', response.url.split('/')[-1])

     answer_meta = selector.select('.//span[@class="answer-meta"]')
     answer_loader.add_value('answerer', self.get_user(answer_meta))

     answer_loader.add_xpath(
         'answer_content',
         './/span[@class="answer-body"]'
         '//span[@class="answer-status"]//descendant-or-self::text()')

     # Debug output of the extracted answer text.
     print(answer_loader.get_output_value('answer_content'))
     # The original paused here on ``a = input()``. That blocks the crawl
     # waiting on stdin and, on Python 2, evaluates whatever is typed, so the
     # debugging pause has been removed.
     return answer_loader.load_item()

Exemple #43

0

Afficher le fichier

Fichier : livingsocial_spider.py Projet : mandliya/python-web-crawler

    def parse(self, response):
        """Yield one LivingSocialDeal per node matched by ``self.deals_list_xpath``.

        Field names and their XPath expressions come from ``self.item_fields``.
        """
        page = HtmlXPathSelector(response)

        # Walk over the deal nodes.
        for deal_node in page.select(self.deals_list_xpath):
            deal_loader = XPathItemLoader(LivingSocialDeal(), selector=deal_node)

            # Processors: remove surrounding whitespace, then join the values.
            deal_loader.default_input_processor = MapCompose(unicode.strip)
            deal_loader.default_output_processor = Join()

            # Register every configured field with its XPath.
            for field_name, field_xpath in self.item_fields.iteritems():
                deal_loader.add_xpath(field_name, field_xpath)
            yield deal_loader.load_item()

Exemple #44

0

Afficher le fichier

Fichier : chunyu_doctor_spider.py Projet : lee670523/gitPython

    def parse_doctor_detail(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        Yields a single CYDoctorItem built from the doctor detail page. The
        title/hospital/specialty triple is taken from the short description
        only when it splits into exactly three words; the answer and review
        counts are taken from the "resolvedData" block only when their text
        matches the expected pattern.

        @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969
        @returns items 1 1
        @returns requests 0 0
        """

        hxs = HtmlXPathSelector(response)

        l = XPathItemLoader(CYDoctorItem(), hxs)

        l.add_xpath('_name', ("//div[@class='bdHd']/h1/text()"))

        shortdesc = hxs.select(
            "//div[@id='mainColumn']//p[@class='bdFt']/text()").extract()
        if len(shortdesc) == 1:
            shortdescStr = shortdesc[0].strip()
            words = shortdescStr.split()
            if len(words) == 3:
                l.add_value('title', words[0])
                l.add_value('hospital', words[1])
                l.add_value('specialty', words[2])
            else:
                print("title/hostpital/special error.")

        l.add_xpath(
            'specialtyDesc',
            "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()")
        l.add_xpath(
            'personalInfo',
            "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()")
        l.add_xpath('stars', "//p[@class='right starTxt']/text()")

        answer = hxs.select(
            "//div[@id='resolvedData']/p[1]/a/text()").extract()
        if len(answer) == 1:
            answerStr = answer[0].strip().replace(u"\xa0", "")
            m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr)
            # re.match returns None when the text does not start with the
            # expected label; the original dereferenced it unconditionally
            # and raised AttributeError in that case.
            if m is not None and m.group("answer_cnt") is not None:
                l.add_value('answers', m.group("answer_cnt"))

        review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract()
        if len(review) == 1:
            reviewStr = review[0].strip().replace(u"\xa0", "")
            m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr)
            # Same guard as above: skip the field when the pattern is absent.
            if m is not None and m.group("review_cnt") is not None:
                l.add_value('reviews', m.group("review_cnt"))

        # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()")
        # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()")

        ret = l.load_item()
        # Debug output of the loaded item.
        print(ret)

        yield ret

Exemple #45

0

Afficher le fichier

    def search_results(self, response):
        """Yield a NrcScrapedReport per new result row, then the next-page request.

        Rows come from the ``t16Standard`` table; the first row is dropped as
        the header. Reports for which ``self.db.reportExists`` is true are
        logged and skipped. For the others the item is yielded and its bot
        task status is set to 'DONE'. Finally, if a pagination link is
        found, a Request for it is yielded with this method as callback.
        """
        # Decode the body explicitly and build the selector from text.
        text = unicode (response.body, response.encoding)
        hxs = HtmlXPathSelector(text=text)
        reports = hxs.select ('//table[@class="t16Standard"]/tr')
        if (len(reports) == 0):
            self.log('Incident report data not present in response', log.ERROR)
        else:
            # Skip the first report record because this is the header row
            reports.pop (0)
            if (len(reports) == 0):
                self.log('No incident reports found in response', log.WARNING)
            else:
                self.log('Retrieved {0} incident reports'.format(len(reports)), log.INFO)

        for report in reports:
            l = XPathItemLoader(NrcScrapedReport(), report)
            # Expose the page URL to the loader through its context.
            l.context['base_url'] = response.url
            # Every declared field is expected to carry an 'xpath' entry here.
            for name, params in NrcScrapedReport.fields.items():
                l.add_xpath(name, params['xpath'])
            item = l.load_item()
            if self.db.reportExists(item['reportnum']):
                self.log('Report {0} already exists.  Skipping to next report.'.format(item['reportnum']), log.INFO)
            else:
                # NOTE(review): f_request and m_request are built but never
                # yielded -- the code that yielded them is commented out below.
                f_request = Request(
                    item['full_report_url'],
                    callback=self.parse_full_report)
                m_request = Request(
                    item['materials_url'],
                    callback=self.parse_materials)
                yield item
                # Runs when the generator is resumed after the yield above.
                self.db.setBotTaskStatus(item['reportnum'], self.name, 'DONE')

#                if self.db.fullReportExists (item['reportnum']):
#                    self.log('Full report Report {0} already exists.  Skipping download.'.format(item['reportnum']), log.INFO)
#                else:
#                    yield f_request
#
#                if self.db.materialExists (item['reportnum']):
#                    self.log('Materials record(s) already exist for report {0}.  Skipping download.'.format(item['reportnum']), log.INFO)
#               else:
#                    yield m_request

        # get next page of results
        # NOTE(review): ``next`` shadows the builtin of the same name.
        next = hxs.select('//td[@class="pagination"][4]/a/@href')
        if len(next) > 0:
            yield Request (urljoin(response.url, next[0].extract()), callback=self.search_results)

Exemple #46

0

Afficher le fichier

Fichier : blog.py Projet : mrgretwon/scraper

    def parse(self, response):
        """Yield a TeoniteItem per ``self.data_list`` match, then follow next pages.

        Field names and XPath expressions come from ``self.item_fields``;
        every node matched by ``self.next_page`` is followed with this method
        as the callback.
        """
        page = HtmlXPathSelector(response)

        # One item per matched data node.
        for entry in page.select(self.data_list):
            entry_loader = XPathItemLoader(TeoniteItem(), selector=entry)

            # Strip each extracted string, then join a field's values.
            entry_loader.default_input_processor = MapCompose(str.strip)
            entry_loader.default_output_processor = Join()

            for field_name, field_xpath in self.item_fields.items():
                entry_loader.add_xpath(field_name, field_xpath)
            yield entry_loader.load_item()

        # Pagination.
        for next_link in page.select(self.next_page):
            yield response.follow(next_link, callback=self.parse)

Exemple #47

0

Afficher le fichier

    def parse(self, response):
        """Yield one LinhaItem per row matched by ``self.lista_linhas_xpath``.

        'linha' and 'nome' are read from the first and third cells of the
        row; 'ida' and 'volta' are read from the ``meta`` of a Request built
        for the row's detail link.
        """
        hxs = HtmlXPathSelector(response)

        for qxs in hxs.select(self.lista_linhas_xpath):
            loader = XPathItemLoader(LinhaItem(), selector=qxs)
            loader.add_xpath('linha', './td[1]/p//text()')
            loader.add_xpath('nome', './td[3]/p//text()')

            # Detail-page URL: base URL plus the first href in the third cell.
            link = self.base_url + qxs.select('./td[3]//a/@href').extract()[0]
            # TODO: Should keep the context and return the data from the next
            #       page, but it seems that it is not being returned.
            request = Request(link, callback=self.parse_item)
            #pdb.set_trace()

            # NOTE(review): ``request`` was just created without a ``meta``
            # argument and is never yielded, so these lookups presumably
            # raise KeyError -- confirm how 'ida'/'volta' should be obtained.
            loader.add_value('ida', request.meta['ida'])
            loader.add_value('volta', request.meta['volta'])

            yield loader.load_item()

Exemple #48

0

Afficher le fichier

    def get_user(self, selector, response, label):
        """Load a StackOverflowUser from the user-details block under ``selector``.

        ``response`` and ``label`` are accepted but not used by this method.

        Returns the loaded StackOverflowUser item; ``user_id`` is only set
        when a non-empty ``user_link`` was extracted.
        """
        details_xpath = './/div[contains(@class, "user-details")]'

        user_loader = XPathItemLoader(item=StackOverflowUser(),
                                      selector=selector)
        user_loader.add_xpath('user_name', details_xpath + '/a/text()')
        user_loader.add_xpath('user_link', details_xpath + '/a/@href')

        # Evaluate the output value once instead of three times; the original
        # also bound it to an unused local.
        user_link = user_loader.get_output_value('user_link')
        if user_link:
            # NOTE(review): the link itself is stored as the user id, as in
            # the original -- confirm whether an id should be parsed out of it.
            user_loader.add_value('user_id', user_link)

        return user_loader.load_item()

Exemple #49

0

Afficher le fichier

    def parse_talk(self, response):
        """Build a Pybr8TalksItem from a talk proposal page.

        All four fields are extracted from ``response`` with fixed XPaths.
        """
        # (field, xpath) pairs, in the order they are added to the loader.
        field_xpaths = (
            ('title', '//div[@id="proposal"]/h1/text()'),
            ('description', '//div[@class="twocolumn"]/div[2]/text()[2]'),
            ('author_name', '//div[@class="twocolumn"]/div/div[2]/h3/text()'),
            ('author_profile', '//div[@class="twocolumn"]/div/div[2]/text()[3]'),
        )

        talk_loader = XPathItemLoader(item=Pybr8TalksItem(), response=response)
        for field_name, field_xpath in field_xpaths:
            talk_loader.add_xpath(field_name, field_xpath)

        return talk_loader.load_item()

Exemple #50

0

Afficher le fichier

Fichier : crunchbaseevents_spider.py Projet : schwema/svi-training-a

    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses.

        Yields one CrunchBaseEvent per node matched by
        ``self.events_list_xpath``, loading the fields in ``self.item_fields``.
        """
        page = HtmlXPathSelector(response)

        # Walk over the event nodes.
        for event_node in page.select(self.events_list_xpath):
            event_loader = XPathItemLoader(CrunchBaseEvent(), selector=event_node)

            # Processors: strip each extracted string, then join the values.
            event_loader.default_input_processor = MapCompose(unicode.strip)
            event_loader.default_output_processor = Join()

            # Register every configured field with its XPath.
            for field_name, field_xpath in self.item_fields.iteritems():
                event_loader.add_xpath(field_name, field_xpath)

            yield event_loader.load_item()

Exemple #51

0

Afficher le fichier

    def parse(self, response):
        """Build a SearchResultItem for the searched keyword on this page.

        The item id is the MD5 hex digest of
        ``'<auction_id>::<item_name>::<spider name>'``.
        """
        keywords_xpath = "//input[@id='ctl00_ctlPagePlaceHolder_Keywords']/@value"

        hxs = HtmlXPathSelector(response)
        # NOTE(review): this is the raw result of extract(), presumably a
        # list, so its repr is what gets hashed -- confirm that is intended.
        item_name = hxs.select(keywords_xpath).extract()
        hash_source = '%s::%s::%s' % (self.auction_id, item_name, self.name)
        item_hash = hashlib.md5(hash_source).hexdigest()

        result_loader = XPathItemLoader(item=SearchResultItem(), response=response)
        result_loader.add_value("id", item_hash)
        result_loader.add_value("auction_id", self.auction_id)
        result_loader.add_value("site", self.name)
        result_loader.add_xpath("name", keywords_xpath)
        result_loader.add_value("link", response.url)
        result_loader.add_xpath("price", "//td[7]/text()")

        return result_loader.load_item()

Exemple #52

0

Afficher le fichier

 def parse(self, response):
   """Build a PersonItem from the search URL parameters and the result page.

   Returns None when any of the expected URL parameters is missing (the
   lookup of the first regex match raises IndexError), otherwise the
   loaded item.
   """
   # Drop backslashes and \xa0 bytes from the body before loading.
   response.body = response.body.replace('\\','').replace('\xa0','')
   p = XPathItemLoader(item=PersonItem(), response=response)

   # (field, pattern) pairs matched against the response URL, in order.
   url_fields = (
     ('first_name', '&qf=(\w+)&'),
     ('middle_name', '&qmi=(\w+)&'),
     ('last_name', '&qn=(\w+)&'),
     ('city', '&qc=(\w+)&'),
     ('state', '&qs=(\w+)&'),
     ('zipcode', '&qz=(\d+)&'),
     ('prop_ref', '&prop_ref=(\d+)'),
   )

   try:
     for field_name, pattern in url_fields:
       p.add_value(field_name, re.findall(pattern, response.url)[0])

     p.add_xpath('cities', '//div[@class="addresses"]/p/b/text()[1]', re="([^\(]+)")
     p.add_xpath('age','//div[@class="greenTopBoxLeft round12_12_0_0"]/p[@class="nameAge"]/text()[2]', re=", Age (\d+)")
   except IndexError:
     return None

   return p.load_item()

Exemple #53

0

Afficher le fichier

Fichier : doc_spider.py Projet : lee670523/gitPython

    def parse_faculty_detail(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        First yields a Request (handled by this same method) for every link
        matching the "menzhen.htm?orderby" pattern, then one DoctorItem per
        row of the doctor list table that has a 'tda' cell.

        @url http://www.haodf.com/faculty/DE4rO-XCoLU0Jq1rbc1P6dS2aO.htm
        @returns items 21 21
        @returns requests 3 3
        @scrapes _name specialty title shortDesc
        """
        hxs = HtmlXPathSelector(response)

        # Follow the paginated/ordered variants of this faculty page.
        pager_links = SgmlLinkExtractor(
            allow=(r"/faculty/\S+/menzhen.htm\?orderby", ),
            unique=True).extract_links(response)
        for pager_link in pager_links:
            yield Request(pager_link.url, callback=self.parse_faculty_detail)

        # Page-level values shared by every doctor item.
        specialty = hxs.select(
            "/html/body/div[3]/div/div[2]/div/a[3]/text()").extract()
        hospital = hxs.select(
            "/html/body/div[3]/div/div[2]/div/a[2]/text()").extract()

        name_xpath = "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
        doctor_rows = hxs.select(
            "//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]"
        )

        for row in doctor_rows:
            doctor_loader = XPathItemLoader(DoctorItem(), row)

            # Debug output of the first doctor name in the row, if any.
            names = row.select(name_xpath).extract()
            if names:
                print(names[0])

            doctor_loader.add_xpath('_name', name_xpath)
            doctor_loader.add_value('specialty', specialty)
            doctor_loader.add_value('hospital', hospital)
            doctor_loader.add_xpath('title', "./td[@class='tda']/li/p[1]/text()")
            doctor_loader.add_xpath('acadamicDegree',
                                    "./td[@class='tda']/li/p[2]/text()")
            doctor_loader.add_xpath('shortDesc', "./td[@class='tdb']/text()")
            # TODO: clinic time

            yield doctor_loader.load_item()

Exemple #54

0

Afficher le fichier

    def myparse(self, response):
        """Dump the page to ``<title>.html`` and yield a MoviesClass item per block.

        The title is the first ``title4`` link text found under the
        ``detailed`` div(s); the raw response body is written to a file named
        after it in the current working directory. One item is then yielded
        per ``detailed`` div, loading the fields in ``self.mov_fields``.

        Raises IndexError when no title text is found on the page.
        """
        print("myParse")
        selector = HtmlXPathSelector(response)
        detail_nodes = selector.select('//div[@id="detailed"]')
        titles = detail_nodes.select('.//div[@class="title4"]/a/text()').extract()
        title = titles[0].strip()

        # Use a context manager so the dump file is closed deterministically;
        # the original left the handle open.
        with open(title + '.html', 'wb') as dump_file:
            dump_file.write(response.body)
        print(title)

        for deal in detail_nodes:
            loader = XPathItemLoader(MoviesClass(), selector=deal)
            loader.default_input_processor = MapCompose(unicode.strip)
            # The original assigned Join() and immediately replaced it with
            # TakeFirst(); only the effective processor is kept.
            loader.default_output_processor = TakeFirst()

            for field, xpath in self.mov_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

Exemple #55

0

Afficher le fichier

Fichier : leboncoin.py Projet : ajocelynpatrick/scrapy-lbc

    def parse(self, response):
      """Yield one LeboncoinItem per ad link found in the "list-ads" div.

      'name', 'photo' and 'url' are extracted relative to each ad link;
      'category' is the fourth-from-last ``/``-separated segment of the
      response URL.
      """
      # Earlier manual implementation, kept commented out for reference:
      # hxs     = HtmlXPathSelector(response)
      # ads     = hxs.select('//div[@class="list-ads"]/a')
      # items   = []
      # for ad in ads:
      #     item = LeboncoinItem()
      #     item['name']    = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
      #     item['photo']   = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
      #     item['url']     = ad.select('@href').extract()

           # self.log(item['name'])
            #print item['name'],':' ,item['photo'],'--->', item['url']
           #html = '<div><div style="width:150px;height:250px;float:left;text-align:center">\
           #<img src="%s" alt="" /><br />\
           #<p><a href="%s">%s</a></p>\
           #</div></div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']) )

           ##print photo
           #items.append(item)
           ##   put in filename
           #filename = response.url.split("/")[-4]
           #open('/tmp/lbc/'+filename+'.html', 'a').write(html)
        #return items
        #yield items
        hxs = HtmlXPathSelector(response)
        for qxs in hxs.select('//div[@class="list-ads"]/a'):
            # XPaths below are relative to the ad's <a> element.
            loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
            loader.add_xpath('name'      ,  'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*' )
            loader.add_xpath('photo'     ,  'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src' )
            loader.add_xpath('url'       ,  '@href' )
            loader.add_value('category'  ,  response.url.split("/")[-4]  )

            yield loader.load_item()

Exemple #56

0

Afficher le fichier

    def parse_materials(self, response):
        """Yield one NrcScrapedMaterial per data row of the materials table.

        The first table row is dropped as the header. Each item gets its
        ``reportnum`` from the ``P3_SEQNOS:<digits>`` part of the response URL
        and every declared field that carries an 'xpath' entry.
        """
        decoded_body = unicode(response.body, response.encoding)
        page = HtmlXPathSelector(text=decoded_body)
        material_rows = page.select('//table[@class="t16Standard"]/tr')

        if len(material_rows) == 0:
            self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        else:
            # Skip the first report record because this is the header row
            material_rows.pop(0)
            if len(material_rows) == 0:
                self.log('No incident reports found in response', log.INFO)
            else:
                self.log('Retrieved {0} materials records'.format(len(material_rows)), log.INFO)

        for row in material_rows:
            material_loader = XPathItemLoader(NrcScrapedMaterial(), row)
            material_loader.add_value(
                'reportnum', response.url, TakeFirst(), re='P3_SEQNOS:(\d+)')
            # Only fields that declare an XPath are loaded from the row.
            for field_name, field_params in NrcScrapedMaterial.fields.items():
                if 'xpath' in field_params:
                    material_loader.add_xpath(field_name, field_params['xpath'])
            yield material_loader.load_item()

Exemple #57

0

Afficher le fichier

Fichier : livingsocial_spider.py Projet : wesleymutwiri/100_days_of_code

    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses.

        Yields one LivingSocialDeal per node matched by
        ``self.deals_list_xpath``, loading the fields in ``self.item_fields``.

        Testing contracts:
        @url http://www.livingsocial.com/cities/15-san-francisco
        @returns items 1
        @scrapes title link
        """
        page = HtmlXPathSelector(response)

        # Walk over the deal nodes.
        for deal_node in page.xpath(self.deals_list_xpath):
            deal_loader = XPathItemLoader(LivingSocialDeal(), selector=deal_node)

            # Processors: strip each extracted string, then join the values.
            deal_loader.default_input_processor = MapCompose(unicode.strip)
            deal_loader.default_output_processor = Join()

            # Register every configured field with its XPath.
            for field_name, field_xpath in self.item_fields.iteritems():
                deal_loader.add_xpath(field_name, field_xpath)
            yield deal_loader.load_item()

Exemple #58

0

Afficher le fichier

Fichier : hospital_spider.py Projet : lee670523/gitPython

    def parse(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        Yields, in order: a CityItem for the city tree, a Request (handled by
        this same method) per link in the tree's 'kstl' div, and a
        HospitalItem per hospital link in the result area.

        @url http://www.haodf.com/yiyuan/shanghai/list.htm
        @returns items 21 21
        @returns requests 3 3
        @scrapes _hospitalName grade area city
        """
        hxs = HtmlXPathSelector(response)
        city_tree = hxs.select("//div[@id='el_tree_1000000']")

        # City name, reused below for every hospital item.
        city_name = city_tree.select(
            "div[@class='kstl2']/a/text()").extract()[0]

        city_loader = XPathItemLoader(CityItem(), city_tree)
        city_loader.add_xpath('cityAreas', "div[@class='ksbd']/ul/li/a/text()")
        city_loader.add_xpath('_cityName', "div[@class='kstl2']/a/text()")
        yield city_loader.load_item()

        for url in city_tree.select("div[@class='kstl']/a/@href").extract():
            yield Request(url, callback=self.parse)

        area_nodes = hxs.select(
            "//div[@id='el_result_content']/div/div[@class='bxmd']/div")
        for hospital in area_nodes.select("div[@class='m_ctt_green']/ul/li/a"):
            hospital_loader = XPathItemLoader(HospitalItem(), hospital)
            hospital_loader.add_xpath('_hospitalName', "text()")

            # The sibling span holds "(grade)" optionally followed by a
            # feature description; only parse it when exactly one is present.
            features = hospital.select(
                "following-sibling::span/text()").extract()
            if len(features) == 1:
                m = re.match(u"\((?P<grade>\S+)(|, 特色:(?P<feature>\S+))\)",
                             features[0].strip())
                if m is not None:
                    groups = m.groupdict()
                    for key in ('grade', 'feature'):
                        if groups[key] is not None:
                            hospital_loader.add_value(key, groups[key])

            hospital_loader.add_xpath(
                'area',
                "parent::*/parent::*/parent::*/preceding-sibling::*[1]/attribute::id"
            )
            hospital_loader.add_value('city', city_name)
            yield hospital_loader.load_item()

Exemple #59

0

Afficher le fichier

Fichier : proveedores.py Projet : ingindIsrael/GPB

    def parse(self, response):
        """Yield a ProveedorItem per table row, then follow extracted links.

        Rows come from the table inside the "miListView" div; every link
        returned by ``self.extractor`` is requested with this method as the
        callback.
        """
        hxs = HtmlXPathSelector(response)

        # Cells td[1]..td[4] of each row feed these fields, in this order.
        column_fields = ('nombre', 'domicilio', 'cuit', 'localidad')

        for row in hxs.select('//div[@id="miListView"]/table/tr'):
            row_loader = XPathItemLoader(item=ProveedorItem(), selector=row)
            for column, field_name in enumerate(column_fields, 1):
                row_loader.add_xpath(field_name, 'td[%d]/text()' % column)

            yield row_loader.load_item()

        for link in self.extractor.extract_links(response):
            yield Request(link.url, callback=self.parse)

Exemple #60

0

Afficher le fichier

Fichier : talkspider_crawl.py Projet : dmclain/scrapy-pytexas-2013

 def parse_item(self, response):
     """Build a PytexasItem (title, speaker, description) from a talk page."""
     # (field, xpath) pairs, in the order they are added to the loader.
     field_xpaths = (
         ('title', '//*/div[@class="span6"]/h2/text()'),
         ('speaker', '//*/div[@class="span6"]/h3/text()'),
         ('description', '//*/div[@class="span6"]/p[2]/text()'),
     )

     talk_loader = XPathItemLoader(item=PytexasItem(), response=response)
     for field_name, field_xpath in field_xpaths:
         talk_loader.add_xpath(field_name, field_xpath)
     # Literal values can also be added with add_value, e.g.
     # talk_loader.add_value('last_updated', 'today')
     return talk_loader.load_item()