def parse(self, response):
    hxs = HtmlXPathSelector(response)
    if "openmarket" in response.url:
        rate = XPathItemLoader(item=FinanceIndex(), response=response)
        rate.add_value("name", "Tasa Objetivo FED")
        rate.add_value("unit", "%")
        rate.add_value("value", hxs.select("//td[@class='data'][3]/text()").re("\d+\.\d+"))
        #rate.update_only_if_change = True
        return [rate.load_item()]
    else:
        for line in response.body_as_unicode().splitlines():
            if "Federal funds (effective)" in line:
                rate = XPathItemLoader(item=FinanceIndex(), response=response)
                rate.add_value("name", "FED effective fund rate")
                rate.add_value("unit", "%")
                rate.add_value("value", hxs.select("//th[contains(text(), 'Federal funds')]/following-sibling::td/text()").re("\xa0(.*?)\xa0"))
                return [rate.load_item()]

def parse(self, response):
    """ This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.haodf.com/yiyuan/shanghai/list.htm
    @returns items 21 21
    @returns requests 3 3
    @scrapes _hospitalName grade area city
    """
    hxs = HtmlXPathSelector(response)
    city_tree = hxs.select("//div[@id='el_tree_1000000']")
    # Used for hospital
    _cityName = city_tree.select("div[@class='kstl2']/a/text()").extract()[0]
    l = XPathItemLoader(CityItem(), city_tree)
    l.add_xpath('cityAreas', "div[@class='ksbd']/ul/li/a/text()")
    l.add_xpath('_cityName', "div[@class='kstl2']/a/text()")
    yield l.load_item()
    for url in city_tree.select("div[@class='kstl']/a/@href").extract():
        yield Request(url, callback=self.parse)
    area_list = hxs.select("//div[@id='el_result_content']/div/div[@class='bxmd']/div")
    hospital_list = area_list.select("div[@class='m_ctt_green']/ul/li/a")
    for hospital in hospital_list:
        l = XPathItemLoader(HospitalItem(), hospital)
        l.add_xpath('_hospitalName', "text()")
        featureList = hospital.select("following-sibling::span/text()").extract()
        if len(featureList) == 1:
            featureStr = featureList[0].strip()
            m = re.match(u"\((?P<grade>\S+)(|, 特色:(?P<feature>\S+))\)", featureStr)
            if m is not None:
                if m.groupdict()["grade"] is not None:
                    l.add_value('grade', m.groupdict()["grade"])
                if m.groupdict()["feature"] is not None:
                    l.add_value('feature', m.groupdict()["feature"])
        #l.add_xpath('feature', "following-sibling::span/text()")
        l.add_xpath('area', "parent::*/parent::*/parent::*/preceding-sibling::*[1]/attribute::id")
        l.add_value('city', _cityName)
        yield l.load_item()

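# Note: the @url/@returns/@scrapes lines in the docstring above are Scrapy
# spider contracts. A minimal sketch of how they are exercised, assuming the
# spider above is registered under the hypothetical name "haodf":
#
#   $ scrapy check haodf
#
# "scrapy check" fetches each contract's @url, runs the callback on the
# response, and asserts the declared item/request counts and scraped fields.
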
def get_question(self, selector, response):
    # both select function and selector's join function need to add a dot
    # to search relative to the current node
    question_loader = XPathItemLoader(item=LazyTweetQuestion(),
                                      selector=selector)
    question_loader.add_xpath('question_content', ''.join([
        './/span[@class="post-body"]',
        '//span[@class="post-status"]/descendant-or-self::text()'
    ]))
    # not useful
    question_loader.add_xpath('question_tags',
                              '//*[@id="post-tags"]/ul/li/a/text()')
    question_loader.add_xpath('asking_date',
                              './/span[@class="post-meta"]//span[@class="timestamp"]/text()')
    question_loader.add_value('asker', self.get_user(
        selector.select('.//span[@class="post-meta"]')))
    question_loader.add_xpath('number_of_answers', ''.join([
        './/span[@class="post-meta"]',
        '//a[last()]/text()'
    ]))
    question_loader.add_value('question_id', response.url.split('/')[-1])
    print question_loader.get_output_value('question_tags')
    return question_loader.load_item()

def parse(self, response):
    # Earlier non-loader version, kept commented out for reference:
    # hxs = HtmlXPathSelector(response)
    # ads = hxs.select('//div[@class="list-ads"]/a')
    # items = []
    # for ad in ads:
    #     item = LeboncoinItem()
    #     item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
    #     item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
    #     item['url'] = ad.select('@href').extract()
    #     self.log(item['name'])
    #     #print item['name'], ':', item['photo'], '--->', item['url']
    #     #html = '<div><div style="width:150px;height:250px;float:left;text-align:center">\
    #     #<img src="%s" alt="" /><br />\
    #     #<p><a href="%s">%s</a></p>\
    #     #</div></div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']))
    #     ##print photo
    #     #items.append(item)
    #     ## put in filename
    #     #filename = response.url.split("/")[-4]
    #     #open('/tmp/lbc/' + filename + '.html', 'a').write(html)
    #return items
    #yield items
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select('//div[@class="list-ads"]/a'):
        loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
        loader.add_xpath('name', 'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*')
        loader.add_xpath('photo', 'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src')
        loader.add_xpath('url', '@href')
        loader.add_value('category', response.url.split("/")[-4])
        yield loader.load_item()

def get_answer(self, selector, question_loader):
    answer_loader = XPathItemLoader(item=YahooAnswer(), selector=selector)
    answer_loader.add_xpath('answer_id', './@id')
    answer_loader.add_xpath('answer_content', './/div[@class="qa-container"]//div[@class="content"]//text()')
    answer_loader.add_value('answerer', self.get_user(selector))
    answer_loader.add_value('question_id', question_loader.get_output_value('question_id'))
    answer_loader.add_xpath('answering_date', ''.join([
        './/div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]/text()'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]//strong/text()'
    ]))
    # get the good number or bad number
    marks = answer_loader.get_output_value('marks')
    # print marks
    if marks.find('good') != -1:  # str.find() returns -1 when not found
        answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])
    # bad numbers
    # is best answer
    answer_class = selector.select('./@class').extract()[0]
    if answer_class.find('best') != -1:
        answer_loader.add_value('is_best_answer', 1)
    else:
        answer_loader.add_value('is_best_answer', 0)
    return answer_loader.load_item()

def parse_series_item(self, response):
    hxs = HtmlXPathSelector(response)
    videos = hxs.select('//div[@class="vo1"]')
    for v in videos:
        l = XPathItemLoader(FySeriesItem(), v)
        series_id = self._get_series_id(response.url)
        text = v.select('dl[@class="vd1"]/dt[5]/text()').extract()
        episode_all = self._get_episode_all(text[0])
        l.add_xpath('title', 'div[@class="vd"]/text()[2]', MapCompose(unicode.strip), re='](.+)')
        l.add_xpath('image_url', 'dl[@class="vd1"]/dd/img/@src')
        l.add_xpath('director', 'dl[@class="vd1"]/dt[1]/text()', self._get_default, re='...(.+)')
        l.add_xpath('actor', 'dl[@class="vd1"]/dt[2]/text()', self._get_default, re='...(.+)')
        l.add_xpath('origin', 'dl[@class="vd1"]/dt[4]/text()', self._get_default, re='...(.+)')
        l.add_xpath('episode_count', 'dl[@class="vd1"]/dt[5]/text()', self._get_default, re='\d+')
        l.add_xpath('release_date', 'dl[@class="vd1"]/dt[6]/text()', self._get_default, re='...(.+)')
        l.add_xpath('description', 'dl[@class="vd4"][2]/dd/text()', MapCompose(unicode.strip, self._get_default))
        l.add_value('source_id', self.name + '_' + series_id)
        l.add_value('episode_all', episode_all)
        l.add_value('channel', 2)
        series = l.load_item()
        request = Request(self.episode_list_url + series_id, callback=self.parse_episode_list)
        request.meta['series'] = series
        yield request

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    selector = HtmlXPathSelector(response)
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: x.split('=')[1] for x in details.query.split("&")}
    print "\n", urllib.unquote(queryStr['p%5B%5D']).split("=")[1], queryStr['start']
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(flipkartData(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()

def parse(self, response):
    x = XmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    #programs = x.select('./body/outline[position()=4]/outline[position()<4]')
    programs = x.select('//body/outline/outline')
    podcastCount = str(len(programs))
    i = 0
    allitems = []
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        l.add_xpath('id', 'concat("dpc_", ./@xmlUrl)')
        l.add_value('audioType', 'disco')
        l.add_xpath('brandId', './@xmlUrl')
        l.add_xpath('brandFeed', './@xmlUrl')
        l.add_xpath('brandName', './@title')
        l.add_xpath('brandDescription', './@description')
        l.add_xpath('brandHomepage', './@htmlUrl')
        self.log('Discovering dpc [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
        item = l.load_item()
        yield item

def parse(self, response):
    x = XmlXPathSelector(response)
    x.register_namespace("im", "http://itunes.apple.com/rss")
    x.register_namespace('atom', 'http://www.w3.org/2005/Atom')
    feedCount = str(len(self.start_urls))
    self.i = self.i + 1
    self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)
    entries = x.select('//atom:entry')
    if entries:
        # an iTunes rss feed
        for entry in entries:
            id = entry.select('./atom:id/@im:id').extract()
            self.log('Entry %s' % (str(id)), level=log.INFO)
            yield Request('http://itunes.apple.com/lookup?id=' + id[0], callback=self.getItunesTrackJson)
    else:
        # a single feed
        l = XPathItemLoader(PodcastItem(), x)
        l.add_value('id', 'rssdisco_' + response.url)
        l.add_value('audioType', 'disco')
        l.add_value('brandFeed', response.url)
        l.add_xpath('brandName', '//./channel/title/text()')
        self.log('Feed from rss %s' % (response.url), level=log.INFO)
        item = l.load_item()
        yield item

def parse_page(self, response, chart, next_pages):
    hxs = HtmlXPathSelector(response)
    # parse every chart entry
    entries = []
    for item in hxs.select('//*[@class="printable-row"]'):
        loader = XPathItemLoader(SingleItem(), selector=item)
        loader.add_xpath('rank', 'div/div[@class="prank"]/text()')
        loader.add_xpath('track', 'div/div[@class="ptitle"]/text()')
        loader.add_xpath('artist', 'div/div[@class="partist"]/text()')
        loader.add_xpath('album', 'div/div[@class="palbum"]/text()')
        single = loader.load_item()
        entries.append(dict(single))
    chart['list'] += entries
    if len(next_pages) == 0:
        log.msg("Done with %s" % (chart['name']))
        yield chart
    else:
        next_page = next_pages.popleft()
        log.msg("Starting nextpage (%s) of %s - %s left" % (next_page, chart['name'], len(next_pages)))
        request = Request('http://www.billboard.com' + next_page,
                          callback=lambda r: self.parse_page(r, chart, next_pages))
        yield request

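# parse_page() above threads mutable state (the chart dict and a deque of the
# remaining page URLs) through a lambda callback. A sketch of how the first
# request could be seeded from a hypothetical chart-index callback; the URLs
# and chart name are illustrative assumptions, not taken from the spider:
from collections import deque

from scrapy.http import Request


def start_chart(self, response):  # sketch of a method on the same spider
    chart = {'name': 'Hot 100', 'list': []}
    next_pages = deque(['/charts/hot-100?page=2', '/charts/hot-100?page=3'])
    # bind chart and next_pages into the callback, as parse_page() expects
    yield Request('http://www.billboard.com/charts/hot-100?page=1',
                  callback=lambda r: self.parse_page(r, chart, next_pages))
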
def parse(self, response):
    url = response.url
    group_name = url[url.find("group"):].split("/")[1]
    hxs = HtmlXPathSelector(response)
    dls = hxs.select('//dl[@class="obu"]')
    items = []
    for dl in dls:
        item = GroupUserItem()
        l = XPathItemLoader(item=item, selector=dl)
        l.add_xpath("homepage", "dt/a/@href")
        l.add_xpath("image", "dt/a/img/@src")
        l.add_xpath("name", "dd/a/text()")
        l.add_value("group", group_name)
        yield l.load_item()
    links = hxs.select('//span[@class="next"]/a/@href').extract()
    for url in links:
        yield Request(url, callback=self.parse)
    if len(links) < 1:
        # fall back to a regex when the "next" link is not found via XPath
        p = re.compile('<span class="next">.*?<a href="(.+?)">', re.S)
        m = p.search(response.body_as_unicode())
        if m:
            url = m.group(1)
            yield Request(url, callback=self.parse)

def getRssFeedFromItunes(self, response):
    itunes1 = json.loads(response.body)
    metaData = response.meta["metaData"]
    podcastId = metaData["podcastId"]
    podcastName = metaData["podcastName"]
    genreName = metaData["genreName"]
    itunesPopularInGenre = metaData["itunesPopularInGenre"]
    self.log("%s %s %s" % (response.url, podcastId, podcastName), level=log.INFO)
    if itunes1["resultCount"] == 1:
        # should only ever be one as looking up by Id
        l = XPathItemLoader(PodcastItem(), response=response)
        l.add_value("id", "itunesglobal_" + itunes1["results"][0]["feedUrl"])
        l.add_value("audioType", "disco")
        l.add_value("brandName", podcastName)
        l.add_value("brandCategory", genreName)
        l.add_value("brandFeed", itunes1["results"][0]["feedUrl"])
        l.add_value("itunesPopularInGenre", str(itunesPopularInGenre))
        l.add_value("itunesTrackId", str(itunes1["results"][0]["trackId"]))
        l.add_value("itunesCollectionId", str(itunes1["results"][0]["collectionId"]))
        if "artistId" in itunes1["results"][0]:
            l.add_value("itunesArtistId", str(itunes1["results"][0]["artistId"]))
        item = l.load_item()
        yield item
    else:
        self.log("--FAILED itunes Json Discovering genre %s %s %s" % (response.url, podcastId, podcastName),
                 level=log.WARNING)
        return

def parse_materials(self, response):
    reportnum = response.request.meta['reportnum']
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select('//table[@class="t16Standard"]/tr')
    if len(materials) == 0:
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
    else:
        # Skip the first record because it is the header row
        materials.pop(0)
        if len(materials) == 0:
            self.log('No materials reports found in response {0}'.format(reportnum), log.INFO)
        else:
            self.log('Retrieved {0} materials records in report {1}'.format(len(materials), reportnum), log.INFO)
            for material in materials:
                l = XPathItemLoader(NrcScrapedMaterial(), material)
                l.name_in = lambda slist: [s[:32] for s in slist]
                l.add_value('reportnum', reportnum)
                # each field of the item carries its own 'xpath' metadata
                for name, params in NrcScrapedMaterial.fields.items():
                    if 'xpath' in params:
                        l.add_xpath(name, params['xpath'])
                item = l.load_item()
                yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')

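# The metadata-driven loop above relies on each Field of NrcScrapedMaterial
# carrying its own 'xpath' entry; Scrapy Fields are plain dicts, so arbitrary
# metadata like this is allowed and surfaces through Item.fields. A minimal
# sketch of such an item declaration; the field names and xpaths here are
# illustrative assumptions, not the real schema:
from scrapy.item import Item, Field


class MaterialItemSketch(Item):
    reportnum = Field()                     # filled via add_value(), no xpath
    name = Field(xpath='td[1]/text()')      # picked up by the 'xpath' loop
    amount = Field(xpath='td[2]/text()')
    unit = Field(xpath='td[3]/text()')
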
def parse_rc(self, response):
    loader = XPathItemLoader(item=ParseRcItem(), response=response)
    id = self.parse_id_from_url(response.url)
    loader.add_value('questionId', id)
    loader.add_xpath('text', '//div[@class="text"]/text()')
    loader.add_xpath('text', '//div[@class="text"]/span/text()')
    loader.add_xpath('answerList', '//div[@class="item clearfix"]/span/text()')
    loader.add_xpath('choiceList', '//div[@class="item clearfix"]/b/text()')
    loader.add_xpath('answer', '//div[@class="answer clearfix hidden QuesHidden"]/b/text()')
    # loader.add_xpath('explanation', '//div[@id="DivExplain"]')
    item = loader.load_item()
    if len(item['text']) == 3:
        test = (item['text'][0] + '<span style="text-decoration:underline;">'
                + item['text'][2] + '</span>' + item['text'][1])
    else:
        test = item['text'][0]
    for filename in self.fileList:
        index = filename.find(id)
        if index != -1:
            f = open('/home/huwei/origin/rcarticle/' + filename)
            article = f.read()
            f.close()
            content = self.rc_content.format(
                article[24:len(article) - 4],
                item['questionId'][0], item['questionId'][0], test,
                item['questionId'][0], item['choiceList'][0], item['choiceList'][0], item['answerList'][0],
                item['questionId'][0], item['choiceList'][1], item['choiceList'][1], item['answerList'][1],
                item['questionId'][0], item['choiceList'][2], item['choiceList'][2], item['answerList'][2],
                item['questionId'][0], item['choiceList'][3], item['choiceList'][3], item['answerList'][3],
                item['questionId'][0], item['choiceList'][4], item['choiceList'][4], item['answerList'][4],
                item['questionId'][0], item['answer'][0])
            wf = open('/home/huwei/gmatclub/rc/' + id + '.html', 'w')
            wf.write(content)
            wf.close()
    return item

def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process a downloaded response
    """
    # instantiate HtmlXPathSelector() with the response parameter
    selector = HtmlXPathSelector(response)
    # iterate over contents (multiple entries per page)
    for content in selector.xpath(self.content_list_xpath):
        loader = XPathItemLoader(RedditLearnPython(), selector=content)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)  # strip out whitespace from unicode strings
        loader.default_output_processor = Join()  # join data by a space
        # iterate over fields and add xpaths to the loader;
        # iteritems() yields the (k, v) pairs of the dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add each field's xpath to the loader
        # load_item() grabs each item field (link, title, etc.), gets its
        # xpath, and processes the data with the input/output processors.
        # Yield each item, then move on to the next entry.
        yield loader.load_item()

def parse_sale(self, response):
    l = XPathItemLoader(item=SaleItem(), response=response)
    l.add_value('url', response.url)
    l.add_xpath('address', '//h1[@class="address"]/text()')
    l.add_xpath('price', '//div[@class="price"]/text()')
    l.add_xpath('sale_date', '//th[text()="Last sale:"]/../td/div[last()]/text()', re=r'on (\w+)')
    l.add_xpath('bedrooms', '//th[text()="Bedrooms:"]/../td/text()')
    l.add_xpath('bathrooms', '//th[text()="Bathrooms:"]/../td/text()', re=r'(\d+)')
    l.add_xpath('powder_rooms', '//th[text()="Bathrooms:"]/../td/text()', re=r', (\d+)')
    l.add_xpath('property_type', '//th[text()="Property type:"]/../td/text()')
    l.add_xpath('size', '//th[text()="Size:"]/../td/text()', re=r'([\d|,]+) sqft')
    l.add_xpath('lot', '//th[text()="Lot:"]/../td/text()')
    l.add_xpath('price_per_sf', '//th[text()="Price/sqft:"]/../td/text()')
    l.add_xpath('year_built', '//th[text()="Year built:"]/../td/text()')
    l.add_xpath('public_records', 'id("property_public_info_module")/ul/li/span/text()')
    return l.load_item()

def process_item(self, task_id):
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return
    text = report['full_report_body']
    # clamp every character to 7-bit ASCII
    text = "".join(chr(min(ord(c), 127)) for c in text)
    t = TextResponse(url=report['full_report_url'], body=text.encode('utf-8'))  # must have utf-8 here
    l = XPathItemLoader(NrcParsedReport(), response=t)
    l.add_value('reportnum', task_id)
    patterns = self.compile_patterns()
    for p in patterns:
        l.add_value(p[0], text, TakeFirst(), unicode.strip, re=p[1])
    county = l.get_output_value('county')
    pattern = self.get_area_code_pattern(county)
    if pattern:
        l.add_value('areaid', county)
        l.add_value('blockid', text, TakeFirst(), unicode.strip,
                    re="%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
        l.add_value('blockid', text, TakeFirst(), unicode.strip, re="BLOCK[\s]+([\d]+)")
    item = l.load_item()
    yield item
    self.item_completed(task_id)

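# process_item() above routes plain text through a synthetic TextResponse so
# the loader's re= extraction can be reused on non-HTML report bodies. A
# sketch of the (field, regex) pairs compile_patterns() would have to return
# for that loop to work; the patterns themselves are illustrative guesses:
def compile_patterns(self):  # sketch of a method on the same class
    return [
        ('county', r'COUNTY:\s*([A-Z ]+?)\s{2,}'),
        ('state', r'STATE:\s*([A-Z]{2})'),
        ('material', r'MATERIAL:\s*(\S.*)'),
    ]
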
def parse_page(self, response):
    x = HtmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    l = XPathItemLoader(PodcastItem(), x)
    l.add_xpath('id', 'concat("frc_", //head/link[@rel="alternate"][@type="application/rss+xml"])')
    l.add_value('audioType', 'disco')
    l.add_xpath('brandId', '//head/link[@rel="alternate"][@type="application/rss+xml"]')
    l.add_xpath('brandFeed', '//head/link[@rel="alternate"][@type="application/rss+xml"]')
    l.add_xpath('brandName', './/div[contains(@class, "article-full")]/h2/text()')
    l.add_xpath('brandImage', './/div[contains(@class, "article-full")]/div[@class="illustration"]/img/@src')
    l.add_xpath('brandDescription', './/div[contains(@class, "article-full")]/span[position()=1]/text()')
    l.add_value('brandHomepage', response.url)
    l.add_value('channelName', 'France Culture')
    l.add_value('channelHomepage', 'http://www.franceculture.fr/')
    l.add_value('channelImage', 'http://www.franceculture.fr/sites/all/themes/franceculture/images/logo.png')
    l.add_value('ownerId', 'FRR')
    l.add_value('ownerName', 'Radio France')
    l.add_value('ownerHomepage', 'http://www.radiofrance.fr/')
    l.add_value('ownerKey', 'frr')
    l.add_value('ownerImage', 'http://www.radiofrance.fr/fileadmin/templates/images/bloc_tete/logo.png')
    item = l.load_item()
    self.log('Discovering frr %s' % (item['brandName']), level=log.INFO)
    yield item

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    selector = HtmlXPathSelector(response)
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: x.split('=')[1] for x in details.query.split("&")}
    print "\n", queryStr['page']
    # iterate over deals
    for deal in selector.select(self.products_list_xpath):
        loader = XPathItemLoader(JabongData(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()

def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # instantiate HtmlXPathSelector() with the response parameter
    selector = HtmlXPathSelector(response)
    # iterate over deals (multiple deals per page)
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors: an Item Loader contains one input processor and
        # one output processor for each (item) field
        loader.default_input_processor = MapCompose(unicode.strip)  # strip out whitespace from unicode strings
        loader.default_output_processor = Join()  # join data by a space
        # iterate over fields and add xpaths to the loader;
        # iteritems() yields the (k, v) pairs of the dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add each field's xpath to the loader
        # load_item() grabs each item field (link, title, etc.), gets its
        # xpath, and processes the data with the input/output processors.
        # Yield each item, then move on to the next deal.
        yield loader.load_item()

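# The parse() above assumes the spider class defines deals_list_xpath and
# item_fields, and that LivingSocialDeal declares matching fields. A minimal
# sketch of those definitions; the xpaths here are illustrative assumptions:
from scrapy.item import Item, Field


class LivingSocialDealSketch(Item):
    title = Field()
    link = Field()

# on the spider class:
#     deals_list_xpath = '//ul[@id="deals-list"]/li'
#     item_fields = {
#         'title': './/a/div[@class="deal-details"]/h2/text()',
#         'link': './/a/@href',
#     }
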
def parse_item(self, response):
    l = XPathItemLoader(item=LocalItem(), response=response)
    l.add_xpath('company', '//*[@id="biz-vcard"]/div[2]/h1/span/text()')
    l.add_xpath('phone', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/strong/text()')
    l.add_xpath('locality', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[2]/text()')
    l.add_xpath('region', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[4]/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'locality' in res:
        results['address'] = res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    if 'phone' in res:
        results['phone'] = res['phone']
    return res

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    # Testing contracts:
    # @url http://www.livingsocial.com/cities/15-san-francisco
    # @returns items 1
    # @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # iterate over entries
    for entry in selector.xpath(self.entries_list_xpath):
        loader = XPathItemLoader(WGGesuchtEntry(), selector=entry)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
    cur_index = response.meta.get("cur_index", 1)
    new_url = re.sub("\d+.html", str(cur_index) + ".html", response.url)
    print("\n" + str(response.url) + "\n" + new_url + "\n")
    if cur_index < 59:
        yield Request(new_url, callback=self.parse, meta={"cur_index": cur_index + 1})

def parse_item(self, response, loop, fields):
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})
    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)
        for k, v in fields.iteritems():
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contain "value" or "xpath"'.format(k),
                        level=log.WARNING)
                continue
            val = get_v_x(self.macro.expand(v_x),
                          utils.convert_type(v.get('parse', {})),
                          re=v.get('regex'))
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))
            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break
        else:
            # only yield the item when no field was rejected by its filter
            yield loader.load_item()

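# parse_item() above is configuration-driven. A sketch of the fields dict it
# consumes, inferred from the lookups inside the loop; every key and value
# below is an illustrative assumption, not the project's actual config:
example_fields = {
    'title': {
        'xpath': './/h1/text()',       # extracted with loader.get_xpath()
        'regex': r'\s*(.+?)\s*$',      # optional re= filter
        'parse': {},                   # passed through utils.convert_type()
        'default': '{URL}',            # macro-expanded fallback when empty
        'filter': {},                  # utils.filter_data() gate; break on fail
    },
    'source': {
        'value': '{URL}',              # literal value, macro-expanded
    },
}
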
def parse_items(self, hxs, chart, typeItem):
    # parse every chart entry
    chart_list = []
    for item in hxs.select('//div[contains(@class,"chart_listing")]/article'):
        loader = XPathItemLoader(typeItem, selector=item)
        loader.add_xpath('rank', 'header/span[contains(@class, "chart_position")]/text()')
        # ptitle yields the title for the type, so just set the title to
        # whatever the chart type is
        if 'artist' in chart['type'].lower():
            loader.add_xpath('artist', 'header/p[@class="chart_info"]/a/text()')
        else:
            loader.add_xpath(chart['type'].lower(), 'header/h1/text()')
            loader.add_xpath('artist', 'header/p[@class="chart_info"]/a/text()')
            loader.add_xpath('album', 'header/p[@class="chart_info"]/text()')
        single = loader.load_item()
        chart_list.append(dict(single))
    chart['list'] += chart_list
    return chart

def parse(self, response):
    x = HtmlXPathSelector(response)
    # x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    # programs = x.select('./body/outline[position()=4]/outline[position()<4]')
    programs = x.select('//div[@class="itemContainer"]')
    podcastCount = str(len(programs))
    i = 0
    allitems = []
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        l.add_xpath("id", 'concat("svr_", .//span/h3/a[@class="programName"]/@href)')
        l.add_value("audioType", "disco")
        l.add_xpath("brandId", './/span/h3/a[@class="programName"]/@href')
        l.add_xpath("brandFeed", 'concat("http://sverigesradio.se/sida/poddradio.aspx", ./a/@href)')
        l.add_xpath("brandName", './/span/h3/a[@class="programName"]/text()')
        l.add_xpath("brandDescription", './/div[@class="views-field-field-emission-desc-courte-value"]/p/text()')
        l.add_xpath("brandHomepage", 'concat("http://sverigesradio.se/sida/poddradio.aspx", ./a/@href)')
        l.add_value("ownerId", "SR")
        l.add_value("ownerName", "Sveriges Radio")
        l.add_value("ownerHomepage", "http://sverigesradio.se/")
        l.add_value("ownerKey", "sr")
        l.add_value("ownerImage", "http://sverigesradio.se/diverse/appdata/isidor/images/news_images/3297/459929_87_56.jpg")
        self.log("Discovering svr [%s of %s] feeds" % (i, podcastCount), level=log.INFO)
        item = l.load_item()
        yield item

def parse_item(self, response):
    url_obj = urlparse(response.url)
    path = url_obj.path
    if path.endswith("/"):
        path = path[:-1]
    page = path.split("/")[-1]
    fullDomain = getDomainName(response.url)  # with HTTP or HTTPS
    domain = fullDomain.split("/")[-2]
    newpath = r'C:\\Users\\****\\scrapy_projects\\tutorial\\' + domain
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    os.chdir(newpath)
    filename = '%s.html' % (domain + " " + page)
    with open(filename, 'wb') as f:
        f.write(response.body)
    links = 'links-%s.txt' % (domain + " " + page)
    content = 'contents-%s.txt' % (domain + " " + page)
    f1.write("\n")
    f1.write(domain + sep)
    f1.write(page + sep)
    # 16 whois attributes; query whois once and reuse the record instead of
    # issuing a separate network lookup per attribute
    w = whois.whois(response.url)
    f1.write(str(w.whois_server) + sep)
    f1.write(str(w.referral_url) + sep)
    f1.write(str(w.updated_date) + sep)
    f1.write(str(w.creation_date) + sep)
    f1.write(str(w.expiration_date) + sep)
    f1.write(str(w.name_servers) + sep)
    f1.write(str(w.status) + sep)
    f1.write(str(w.emails) + sep)
    f1.write(str(w.dnssec) + sep)
    f1.write(str(w.name) + sep)
    f1.write(str(w.org) + sep)
    f1.write(str(w.address) + sep)
    f1.write(str(w.city) + sep)
    f1.write(str(w.state) + sep)
    f1.write(str(w.zipcode) + sep)
    f1.write(str(w.country) + sep)
    extractLinks(links, response)
    countRelAbsHttpsLinks(links)
    countInOutLinks(links)
    countSlashes(links)
    imagePreloading(links)
    extractText(content, response)
    countSentences(content)
    checkGrammar(content)
    # Average word length (global_wc can be zero, so guard the division)
    if global_wc:
        f1.write(str("%.2f" % (global_wordLen / global_wc)) + sep)
    else:
        f1.write("0.00" + sep)
    # Number of words in the page:
    f1.write(str(global_wc) + sep)
    # Downloads images
    loader = XPathItemLoader(item=ImageItem(), response=response)
    loader.add_xpath('image_urls', '//img/@src')
    hashImages()  # calculates hashes of images downloaded by scrapy
    # Write label into the data file
    f1.write(my_dict.get(fullDomain, "redirect"))
    return loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=YellowPagesItem(), response=response)
    l.add_xpath('company', '//*[@id="main-content"]/div[1]/div[1]/h1/text()')
    l.add_xpath('st_add', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[1]/text()')
    l.add_xpath('city', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[2]/text()')
    l.add_xpath('phone', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[3]/text()')
    #reviews left
    res = l.load_item()
    print("")
    print("")
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'st_add' in res:
        results['address'] = res['st_add']
    if 'city' in res:
        results['address'] = results['address'] + res['city']
    if 'phone' in res:
        results['phone'] = res['phone']
    print("")
    return res

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)  # strip out whitespace from unicode strings
        loader.default_output_processor = Join()  # join the data together by a space
        # iterate over fields and add xpaths to the loader;
        # iteritems() iterates the (key, value) pairs of a dictionary
        # (there are also iterkeys() and itervalues() functions)
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()  # yield each item and move on to the next
    # output as a json file: scrapy crawl livingsocial -o items.json

def parse_article(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    selector = Selector(response)
    loader = XPathItemLoader(LeMondeArt(), selector=selector)
    self.log('\n\nA response from %s just arrived!' % response.url)
    # define processors
    text_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # Populate the LeMonde item with the item loader
    for field, xpath in self.article_item_fields.iteritems():
        try:
            loader.add_xpath(field, xpath, text_input_processor)
        except ValueError:
            self.log("XPath %s not found at url %s" % (xpath, response.url))
    #loader.add_value("Url", response.url)
    yield loader.load_item()

def parse(self, response):
    x = HtmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    #programs = x.select('./body/outline[position()=4]/outline[position()<4]')
    programs = x.select('//div[@class="item-list"]/ul/li[contains(@class,"views-row")]/div/div/div')
    podcastCount = str(len(programs))
    i = 0
    allitems = []
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        l.add_xpath('id', 'concat("fri_", .//li/a[@class="rss"]/@href)')
        l.add_value('type', 'disco')
        l.add_xpath('brandId', './/li/a[@class="rss"]/@href')
        l.add_xpath('brandFeed', 'concat("http://www.franceinfo.fr", .//li[contains(@class,"link_rss")]/a[@class="rss"]/@href)')
        l.add_xpath('brandName', './/h3/a/text()')
        l.add_xpath('brandTimes', './/div[@class="views-field-field-emission-texte-diffusion-value"]/text()')
        l.add_xpath('brandDescription', './/div[@class="views-field-field-emission-desc-courte-value"]/p/text()')
        l.add_xpath('brandHomepage', './/h3/a/@href')
        l.add_value('channelId', 'franceinfo')
        l.add_xpath('channelName', '//head/meta[@property="og:site_name"]/@content')
        l.add_xpath('channelDescription', '//head/meta[@property="og:description"]/@content')
        l.add_xpath('channelImage', '//div[@id="header"]/div/span/a/img/@src')
        self.log('Discovering fri [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
        item = l.load_item()
        yield item

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # Gives ability to select parts of response defined in deals_list_xpath
    selector = HtmlXPathSelector(response)
    # Iterate through found deals
    for deal in selector.xpath(self.deals_list_xpath):
        # Loads data into item fields defined in items.py
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # Define processors for clean up and joining elements
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Iterate over item_fields dict and add xpaths to loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=TwitterBotItem(), response=response)
    print "###################"
    l.add_xpath('company', '//*[@class="trends-inner"]/div/div[2]/ul/li[1]/a/text()')
    l.add_xpath('street_address', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[1]/text()')
    l.add_xpath('locality', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[2]/text()')
    l.add_xpath('region', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[4]/text()')
    res = l.load_item()
    results = {'name': '', 'address': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'street_address' in res:
        results['address'] = res['street_address']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    return res

def scrape_content_items(self, response):
    hxs = HtmlXPathSelector(response)
    stats = self.crawler.stats
    page_num = hxs.select('//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value').extract()
    if page_num:
        page_num = page_num[0]
        self.log('%s Scraping page %s' % (response.meta['cookiejar'], page_num), log.INFO)
    else:
        self.log('%s No page number found' % (response.meta['cookiejar']), log.WARNING)
    stats.inc_value('_pages', spider=self)
    reports = hxs.select('//table[@id="MainContent_DocumentList1_GridView1"]//tr')
    for report in reports:
        l = XPathItemLoader(FracFocusScrape(), report)
        l.state_in = lambda slist: [s[:20] for s in slist]
        l.county_in = lambda slist: [s[:20] for s in slist]
        # each field of the item carries its own 'xpath' metadata
        for name, params in FracFocusScrape.fields.items():
            l.add_xpath(name, params['xpath'])
        item = l.load_item()
        if item.get('api'):
            if self.db.itemExists(item):
                stats.inc_value('_existing_count', spider=self)
            else:
                stats.inc_value('_new_count', spider=self)
                # print item['operator']
                yield item
    if not stats.get_value('_existing_count') and not stats.get_value('_new_count'):
        self.log('%s No records found' % (response.meta['cookiejar']), log.WARNING)

def parse_item(self, response):
    l = XPathItemLoader(item=BurrpItem(), response=response)
    l.add_xpath('company', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/span/p/text()')
    l.add_xpath('phone', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[1]/strong/text()')
    l.add_xpath('address', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[2]/text()')
    l.add_xpath('region', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/p/a/text()')
    l.add_xpath('cuisine1', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[3]/a[1]/text()')
    l.add_xpath('cuisine2', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[3]/a[2]/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'address' in res:
        results['address'] = res['address']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    return res

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    parse_prices = lambda l: filter(bool, [item.strip() for item in l])
    item_name = hxs.select("//input[contains(concat(' ', @class, ' '), ' search-within ')]/@value").extract()
    item_hash = hashlib.md5('%s::%s::%s' % (self.auction_id, item_name, self.name)).hexdigest()
    item_price = parse_prices(hxs.select("//div[2]//div[2]/text()").extract())
    loader = XPathItemLoader(item=SearchResultItem(), response=response)
    loader.add_value("id", item_hash)
    loader.add_value("auction_id", self.auction_id)
    loader.add_value("site", self.name)
    loader.add_xpath("name", "//input[contains(concat(' ', @class, ' '), ' search-within ')]/@value")
    loader.add_value("link", response.url)
    loader.add_value("price", item_price)
    return loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=FoursquareItem(), response=response)
    l.add_xpath('phone', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/span/text()')
    l.add_xpath('st_add', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[1]/text()')
    l.add_xpath('locality', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[2]/text()')
    l.add_xpath('state', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[4]/text()')
    l.add_xpath('country', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/text()[3]/text()')
    l.add_xpath('company', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/h1/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': '', 'timings': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'st_add' in res:
        results['address'] = res['st_add']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'state' in res:
        results['address'] = results['address'] + res['state']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    if 'country' in res:
        results['address'] = results['address'] + res['country']
    if 'phone' in res:
        results['phone'] = res['phone']
    return results

def parse_listing(self, response):
    l = XPathItemLoader(item=ListingItem(), response=response)
    l.add_value("url", response.url)
    l.add_xpath("address", '//h1[@class="address"]/text()')
    l.add_xpath("price", '//div[@class="price"]/text()')
    l.add_xpath("bedrooms", '//th[text()="Bedrooms:"]/../td/text()')
    l.add_xpath("bathrooms", '//th[text()="Bathrooms:"]/../td/text()', re=r"(\d+)")
    l.add_xpath("powder_rooms", '//th[text()="Bathrooms:"]/../td/text()', re=r", (\d+)")
    l.add_xpath("property_type", '//th[text()="Property type:"]/../td/text()')
    l.add_xpath("size", '//th[text()="Size:"]/../td/text()', re=r"([\d|,]+) sqft")
    l.add_xpath("lot", '//th[text()="Lot:"]/../td/text()')
    l.add_xpath("price_per_sf", '//th[text()="Price/sqft:"]/../td/text()')
    l.add_xpath("year_built", '//th[text()="Year built:"]/../td/text()')
    l.add_xpath("date_listed", '//th[text()="Added on Trulia:"]/../td/text()')
    l.add_xpath("mls_id", '//th[text()="MLS/ID:"]/../td/text()')
    l.add_xpath("descriptive_title", '//h2[@class="descriptive_title"]/text()')
    l.add_xpath("description", '//div[@class="listing_description_module"]/text()')
    l.add_xpath("additional_fields", 'id("property_listing_details_module")/ul/li/span/text()')
    l.add_xpath("public_records", 'id("property_public_info_module")/ul/li/span/text()')
    return l.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=ZomatoItem(), response=response)
    l.add_xpath('phone1', '//*[@id="phoneNoString"]/div/span/span[1]/text()')
    l.add_xpath('company', '//html/body/div[3]/section/div/div[2]/div[2]/div[1]/h1/a/span/text()')
    l.add_xpath('phone2', '//*[@id="phoneNoString"]/div/span/span[2]/text()')
    l.add_xpath('address', '/html/body/div[3]/section/div/div[3]/div[3]/div[1]/div[2]/h4/text()[1]')
    l.add_xpath('review1', '//*[@id="my-reviews-container"]/div[1]/div[3]/div[1]/div[1]/div[3]/div/div[1]/div/text()')
    l.add_xpath('review2', '//*[@id="my-reviews-container"]/div[1]/div[3]/div[1]/div[2]/div[3]/div/div[1]/div/text()')
    l.add_xpath('timings', '//*[@id="mainframe"]/section/div[1]/div/div[2]/div[1]/div[2]/div/div[4]/div[2]/div[1]/span/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': '', 'review1': '', 'review2': '', 'timings': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'address' in res:
        results['address'] = res['address']
    if 'phone' in res:
        results['phone'] = res['phone']
    if 'review1' in res:
        results['review1'] = res['review1']
    if 'review2' in res:
        results['review2'] = res['review2']
    if 'timings' in res:
        results['timings'] = res['timings']
    return res

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # iterate over deals
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def list_item(self, response):
    # log.msg("the response URL:%s" % response.url, level=log.DEBUG)
    sel = Selector(text=response.body)
    result_list = sel.xpath("//div[@class='result-list-item__inner']").extract()
    # logging.info("the response URL:%s" % response.url)
    # with open("temp.txt", 'wb') as f:
    #     f.write(response.selector.xpath("//div[@class='result-list-item__inner']").extract())
    # result_list = response.xpath("//div[@class='result-list-item__inner']").extract()
    num = 0
    for result_item in result_list:
        num = num + 1
        log.msg("this is the %d item" % num, level=log.DEBUG)
        loader = XPathItemLoader(item=Person(), selector=Selector(text=result_item))
        loader.add_xpath('name', ".//h4[@class='member-title result-name']/text()")
        loader.add_xpath('age', ".//div[@class='primary-description truncated-line']/text()[1]")
        loader.add_xpath('bullet', ".//div[@class='primary-description truncated-line']/text()[2]")
        loader.add_xpath('fit', ".//ul[@class='unstyled-list'][1]/li[1]/text()")
        loader.add_xpath('nationnality', ".//ul[@class='unstyled-list'][1]/li[2]/text()")
        loader.add_xpath('price', ".//ul[@class='unstyled-list'][2]/li/text()")
        # items.append(item)
        yield loader.load_item()
    url = sel.xpath("//li[@class='pagination__next ']/a/@href").extract()
    # print url
    yield Request(url[0], callback=self.list_item)

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # iterate over deals
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=AsklailaItem(), response=response)
    l.add_xpath('company', '//*[@id="all-content"]/div[4]/div[1]/div[1]/div[1]/div[1]/h1/span/text()')
    l.add_xpath('st_add', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[1]/text()')
    l.add_xpath('locality', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[2]/a/title/text()')
    l.add_xpath('region', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[4]/text()')
    l.add_xpath('phone', '//*[@id="ldpAdrsDetails"]/p[1]/span/span[1]/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'st_add' in res:
        results['address'] = res['st_add']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    if 'phone' in res:
        results['phone'] = res['phone']
    return results

def parse_item(self, response):
    #hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(item=PytexasItem(), response=response)
    l.add_xpath('title', '//*/div[@class="span6"]/h2/text()')
    l.add_xpath('speaker', '//*/div[@class="span6"]/h3/text()')
    l.add_xpath('description', '//*/div[@class="span6"]/p[2]/text()')
    #l.add_value('last_updated', 'today')  # you can also use literal values
    return l.load_item()

def parse(self, response):
    gold = XPathItemLoader(item=FinanceIndex(), response=response)
    gold.add_value("name", "Oro Spot Cierre Londres")
    gold.add_value("unit", "USD")
    gold.add_xpath("value", "//td[@bgcolor='#cccc99'][1]//text()")
    return [gold.load_item()]

def parse(self, response):
    ubi = XPathItemLoader(item=FinanceIndex(), response=response)
    ubi.add_value("name", "Uruguay Bond Index")
    ubi.add_value("unit", "bps")
    ubi.add_xpath("value", "//span/text()")
    return [ubi.load_item()]

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    # with open('polydata/' + response.url.split('=')[1], 'wb') as f:
    #     f.write(response.body)
    # scraped_url_list = list()
    selector = HtmlXPathSelector(response)
    # iterate over sets
    for deal in selector.select(self.products_list_xpath):
        loader = XPathItemLoader(PolyvoreData(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # scraped_url_list.append(loader.load_item()['requestURL'])
        # load the parent item once and reuse its values for each list entry
        parent = loader.load_item()
        for item in deal.xpath('//*[@id="content"]/ul[1]/li'):
            ll = XPathItemLoader(PolyvoreData(), selector=item)
            # define processors
            ll.default_input_processor = MapCompose(unicode.strip)
            ll.default_output_processor = Join()
            for field, xpath in self.item_items.iteritems():
                ll.add_xpath(field, xpath)
            ll.add_value("requestURL", parent['requestURL'])
            ll.add_value("name", parent['name'])
            ll.add_value("numlikes", parent['numlikes'])
            yield ll.load_item()
        for item in deal.xpath('//*[@id="content"]/ul[2]/li'):
            ll = XPathItemLoader(PolyvoreData(), selector=item)
            # define processors
            ll.default_input_processor = MapCompose(unicode.strip)
            ll.default_output_processor = Join()
            for field, xpath in self.item_items.iteritems():
                ll.add_xpath(field, xpath)
            ll.add_value("requestURL", parent['requestURL'])
            ll.add_value("name", parent['name'])
            ll.add_value("numlikes", parent['numlikes'])
            yield ll.load_item()

def get_user(self, selector):
    user_loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    user_loader.add_xpath('twitter_username', './a[1]/text()')
    user_loader.add_value('twitter_url', ''.join([
        r'http://twitter.com/',
        user_loader.get_output_value('twitter_username')
    ]))
    return user_loader.load_item()

def parse_doctor_detail(self, response):
    """ This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969
    @returns items 1 1
    @returns requests 0 0
    """
    hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(CYDoctorItem(), hxs)
    l.add_xpath('_name', "//div[@class='bdHd']/h1/text()")
    shortdesc = hxs.select("//div[@id='mainColumn']//p[@class='bdFt']/text()").extract()
    if len(shortdesc) == 1:
        shortdescStr = shortdesc[0].strip()
        words = shortdescStr.split()
        if len(words) == 3:
            l.add_value('title', words[0])
            l.add_value('hospital', words[1])
            l.add_value('specialty', words[2])
        else:
            print("title/hospital/specialty error.")
    l.add_xpath('specialtyDesc', "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()")
    l.add_xpath('personalInfo', "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()")
    l.add_xpath('stars', "//p[@class='right starTxt']/text()")
    answer = hxs.select("//div[@id='resolvedData']/p[1]/a/text()").extract()
    if len(answer) == 1:
        answerStr = answer[0].strip().replace(u"\xa0", "")
        m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr)
        if m is not None and m.groupdict()["answer_cnt"] is not None:
            l.add_value('answers', m.groupdict()["answer_cnt"])
    review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract()
    if len(review) == 1:
        reviewStr = review[0].strip().replace(u"\xa0", "")
        m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr)
        if m is not None and m.groupdict()["review_cnt"] is not None:
            l.add_value('reviews', m.groupdict()["review_cnt"])
    # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()")
    # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()")
    ret = l.load_item()
    print ret
    yield ret

def history(text, url):
    response = http.TextResponse(url=url, body=str(text.replace(u'\xa0', '')))
    h = XPathItemLoader(item=TCADValueHistoryItem(), response=response)
    h.add_xpath('year', '//td[1]/text()')
    h.add_xpath('value', '//td[4]/text()')
    return h.load_item()

def improvement(text, url):
    response = http.TextResponse(url=url, body=str(text))
    i = XPathItemLoader(item=TCADImprovementItem(), response=response)
    i.add_xpath('id', '//td[1]/text()')
    i.add_xpath('state_category', '//td[2]/text()')
    i.add_xpath('description', '//td[3]/text()')
    return i.load_item()

def parse(self, response):
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Tasa Objetivo BCU")
    rate.add_value("unit", "%")
    # "8.75" is a literal, not an XPath expression, so use add_value()
    rate.add_value("value", "8.75")
    #rate.update_only_if_change = True
    return [rate.load_item()]

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    entries = hxs.select('//tr[contains(@class,"trusted tlistrow")]/td[contains(@class, "tlistname")]')
    for entry in entries:
        l = XPathItemLoader(item=TorrentItem(), selector=entry)
        l.add_xpath('torrent', 'a/@href')
        l.add_xpath('title', 'a[contains(@href, "nyaa")]/text()')
        yield l.load_item()

def parse_talk(self, response):
    loader = XPathItemLoader(item=Pybr8TalksItem(), response=response)
    loader.add_xpath('title', '//div[@id="proposal"]/h1/text()')
    loader.add_xpath('description', '//div[@class="twocolumn"]/div[2]/text()[2]')
    loader.add_xpath('author_name', '//div[@class="twocolumn"]/div/div[2]/h3/text()')
    loader.add_xpath('author_profile', '//div[@class="twocolumn"]/div/div[2]/text()[3]')
    return loader.load_item()

def parse(self, response):
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Merval")
    rate.add_value("unit", "")
    hxs = HtmlXPathSelector(response)
    rate.add_value("value", hxs.select("//span[contains(@id,'UltimoMerval')]/text()")[0].extract())
    return [rate.load_item()]