def parse(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("im", "http://itunes.apple.com/rss")
		x.register_namespace('atom','http://www.w3.org/2005/Atom')
		feedCount = str(len(self.start_urls))
		self.i=self.i+1
		self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)
		entries = x.select('//atom:entry')
		
		if entries:

			# an iTunes RSS feed
			for entry in entries:
				id = entry.select('./atom:id/@im:id').extract()
				self.log('Entry %s' % (str(id)), level=log.INFO)
				yield Request('http://itunes.apple.com/lookup?id='+ id[0], callback=self.getItunesTrackJson)

		else:
			# a single feed
			l = XPathItemLoader(PodcastItem(), x)
			l.add_value('id', 'rssdisco_'+response.url)
			l.add_value('audioType', 'disco')
			l.add_value('brandFeed', response.url)
			l.add_xpath('brandName', '//./channel/title/text()')
			self.log('Feed from rss %s' % (response.url), level=log.INFO)
			
			item = l.load_item()
			yield item
Example #2
	def parseSubGenre(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("kb", "http://www.kerbango.com/xml")
		metaData = response.meta['metaData']
		stations = x.select('//kb:results/kb:station_record')  # was limited to fewer than 5 stations for testing

		for station in stations:
			metaData['channelPlaylist'] = [station.select('./kb:station_url_record/kb:url/text()').extract()[0].rstrip('/ \r\n')]
			metaData['channelName'] = station.select('./kb:station/text()').extract()	
			metaData['channelDescription'] = station.select('./kb:description/text()').extract()	
			metaData['streamId'] = station.select('./kb:esid/text()').extract()	
			metaData['streamBandwidth'] = station.select('./kb:station_url_record/kb:bandwidth_kbps/text()').extract()	
			metaData['streamData'] = station.select('./kb:station_url_record/kb:status_code/text()').extract()	
			metaData['channelGenreIds'] = metaData['genreId']
			metaData['channelGenres'] = metaData['genreName']
			metaData['channelCategory'] = metaData['genreName']
			
			
			self.log('parseSubGenre %s %s' % (metaData['genreName'], metaData['channelName'] ), level=log.INFO)
			channelName = metaData['channelName'][0]
			channelName = re.sub(r'Low$|High$', '', channelName).strip()  # cope with BBC names that include the bitrate in the name
			tuneInSearchUrl = 'http://tunein.com/search/suggest/?query='+ channelName
			# assume all is well and the supplied URL is indeed a playlist
			
			request = Request(tuneInSearchUrl,
				meta = {'metaData': copy.deepcopy(metaData)},
				callback=self.parseTuneInSearch,
				errback=lambda x:self.parsePlaylist(x,copy.deepcopy(metaData)) )

			yield request
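A note on the errback above: the lambda closes over metaData, so copy.deepcopy runs only when the errback fires, by which point the loop has already mutated the shared dict and every errback sees the last station's data. A minimal sketch of the usual fix, binding a per-iteration snapshot through a default argument (all names from the code above):

			# Default arguments are evaluated at definition time, so each
			# lambda keeps its own snapshot of metaData instead of sharing
			# the dict that the loop keeps mutating.
			snapshot = copy.deepcopy(metaData)
			request = Request(tuneInSearchUrl,
				meta = {'metaData': snapshot},
				callback=self.parseTuneInSearch,
				errback=lambda failure, md=snapshot: self.parsePlaylist(failure, md))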
Example #3
	def parse(self, response):
		xxs = XmlXPathSelector(response)
		xxs.register_namespace("f", "http://www.w3.org/2005/Atom")
		entries = xxs.select('//item')
		for entry in entries:
			item = ZoinkscraperItem()

			item['name'] = entry.select('./title/text()')[0].extract_unquoted()
			item['url'] = entry.select('./link/text()')[0].extract()

			item['date'] = datetime.strptime(entry.select('./pubDate/text()')[0].extract()[:-6],'%a, %d %b %Y %H:%M:%S')
			yield item
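Slicing off the last six characters above silently drops the timezone offset. A timezone-aware alternative sketch using only the Python 2 standard library, whose email.utils parses the RFC 2822 dates that RSS pubDate uses:

from email.utils import mktime_tz, parsedate_tz
from datetime import datetime

raw = entry.select('./pubDate/text()')[0].extract()
# parsedate_tz understands offsets like '+0000'; mktime_tz converts the
# parsed tuple to a UTC timestamp, so the datetime is normalized to UTC.
item['date'] = datetime.utcfromtimestamp(mktime_tz(parsedate_tz(raw)))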
Example #4
	def parse(self, response):
		x = XmlXPathSelector(response)
		x.remove_namespaces()
		x.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
		items = []
		items = x.select('//record/metadata/RDF')

		jsons = []

		for item in items:
			creator = item.select('MetaResource/creator/Agent/name/text()').extract()
			title = item.select('Resource/title/text()').extract()
			uri = item.select('Resource/screen/Image/@rdf:about').extract()
			tags = item.select('Resource/subject/Description/value/text()').extract()
			thumbnail = item.select('Resource/thumbnail/Image/@rdf:about').extract()
			lat = item.select('Resource/spatial/Description/lat/text()').extract()
			long = item.select('Resource/spatial/Description/long/text()').extract()
			locality = item.select('Resource/spatial/Description/locality/text()').extract()
			
			tags_string = '"' + '", "'.join(tags) + '"'
			
			if not lat:
				newlat = 'null'
			else:
				newlat = lat[0]

			if not long:
				newlong = 'null'
			else:
				newlong = long[0]

			if not locality:
				newloc = ''
			else:
				newloc = locality[0]
			
			
			
			json_entry = '{"title": "' + title[0] + '", "uri": "' + uri[0] + '", "attribution_uri": "' + uri[0] + '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail[0] + '", "media_geo_latitude": ' + newlat + ', "media_geo_longitude": ' + newlong + ', "location": "' + newloc + '", "tags": [' + tags_string + '], "archive":"Yahoo! Japan", "media_type": "Image", "layer_type": "Image", "child_items_count":0, "published":1}, '
			
			
			jsons.append(json_entry)
			

		resumptionToken = x.select('//resumptionToken/text()').extract()
		if not resumptionToken:
			open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
		else:
			nextFileLink = "http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + resumptionToken[0].encode('ascii')
			open(resumptionToken[0].encode('ascii') + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
			# only follow the next page when a resumption token was returned;
			# a Request with an empty URL would raise ValueError
			yield Request(nextFileLink, callback=self.parse)
Example #5
def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = XmlXPathSelector(text=nodetext)
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.select(selxpath)[0]
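A quick usage sketch for the iterator above, assuming obj may be a Response (Scrapy's _StreamReader accepts responses as well as plain strings):

from scrapy.http import XmlResponse

body = '<feed><item><t>a</t></item><item><t>b</t></item></feed>'
response = XmlResponse(url='http://example.com/feed', body=body)
for node in xmliter_lxml(response, 'item'):
    print node.select('./t/text()').extract()  # ['a'], then ['b']

Because node.clear() frees each element as soon as it has been copied out, memory use stays flat even on very large feeds.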
Example #7
    def test_selector_namespaces_simple(self):
        body = """
        <test xmlns:somens="http://scrapy.org">
           <somens:a id="foo"/>
           <a id="bar">found</a>
        </test>
        """

        response = XmlResponse(url="http://example.com", body=body)
        x = XmlXPathSelector(response)
        
        x.register_namespace("somens", "http://scrapy.org")
        self.assertEqual(x.select("//somens:a").extract(), 
                         ['<somens:a id="foo"/>'])
Example #8
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     xxs.register_namespace('sac', 'http://www.steepandcheap.com/docs/steepcheap/rss.xml')
     deals = xxs.select('//item')
     items = []
     for deal in deals:
         item = DealItem()
         item['title'] = deal.select('title/text()').extract()
         item['link'] = deal.select('link/@href').extract()
         item['desc'] = deal.select('description/text()').extract()
         item['shortDesc'] = deal.select('sac:listDescription/text()').extract()
         item['curPrice'] = deal.select('sac:priceCurrent/text()').extract()
         item['regPrice'] = deal.select('sac:priceRegular/text()').extract()
         items.append(item)
     return items
Example #9
 def parse(self, response):
     base_url = get_base_url(response)
     xxs = XmlXPathSelector(response)
     xxs.register_namespace("g", "http://base.google.com/ns/1.0")
     products = xxs.select('//channel/item')
     for product in products:
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('url', 'link/text()')
         loader.add_xpath('name', 'title/text()')
         loader.add_xpath('image_url', 'g:image_link/text()')
         loader.add_xpath('price', 'g:price/text()')
         loader.add_xpath('brand', 'g:brand/text()')
         loader.add_xpath('category', 'g:product_type/text()')
         loader.add_xpath('sku', 'g:id/text()')
         loader.add_xpath('identifier', 'g:id/text()')
         yield loader.load_item()
Example #10
	def parse(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("kb", "http://www.kerbango.com/xml")
		items = []
		# limited to menu_id 21 for now
		genres = x.select('//kb:results/kb:menu_record[kb:menu_id/text()="21"]')
		for genre in genres:
			metaData={}

			metaData['genreName'] = genre.select('./kb:name/text()').extract()[:1]
			metaData['genreId'] = genre.select('./kb:menu_id/text()').extract()[:1]
			request = Request('http://pri.kts-af.net/xml/index.xml?tuning_id='+ metaData['genreId'][0],
				meta = {'metaData': copy.deepcopy(metaData)}, 
				callback=self.parseSubGenre)
			self.logProgress('parse', metaData['genreName'], metaData['genreId'], '', level=log.INFO)

			yield request
Example #11
	def parse(self, response):
		xxs = XmlXPathSelector(response)
		for namespace, schema in self.namespaces.iteritems():
			xxs.register_namespace(namespace, schema)

		for entry in xxs.select('//itunesu:entry'):
			metaData={}
			metaData['audioType'] = entry.select('./im:contentType/@term').extract()
			metaData['brandId'] = entry.select('./itunesu:id/text()').extract()
			metaData['brandName'] = entry.select('./im:name/text()').extract()
			metaData['brandDescription'] = entry.select('./itunesu:summary/text()').extract()
			metaData['brandCategory'] = entry.select('./itunesu:category/@label').extract()
			metaData['brandGenres'] = entry.select('./itunesu:category/@label').extract()
			metaData['brandGenreIds'] = entry.select('./itunesu:category/@im:id').extract()
			metaData['brandPublishDate'] = entry.select('./im:releaseDate/text()').extract()
			
			metaData['itunesTrackId'] = entry.select('./itunesu:id/@im:id').extract()
			metaData['itunesArtworkUrl55'] = entry.select('./im:image[@height="55"]/text()').extract()
			metaData['itunesArtworkUrl60'] = entry.select('./im:image[@height="60"]/text()').extract()
			metaData['itunesArtworkUrl170'] = entry.select('./im:image[@height="170"]/text()').extract()
			metaData['itunesCollectionPrice'] = entry.select('./im:price/@amount').extract()
			metaData['itunesCollectionViewUrl'] = entry.select('./itunesu:link/@href').extract()
			
			# haven't got anything that identifies the "department", so using category for now
			metaData['channelName'] = metaData['brandCategory']


			metaData['ownerName'] = entry.select('./im:artist/text()').extract()
			metaData['ownerId'] = entry.select('./im:artist/@href').extract()

			#html = entry.select('./itunesu:content[@type="html"]/text()').extract()[0]
			#hxs = HtmlXPathSelector(text=html)
			#metaData['ownerName'] = hxs.select('//./a[contains(@href,"institution")]/text()').extract()


			itunesUrl = 'http://itunes.apple.com/WebObjects/DZR.woa/wa/viewPodcast?cc=us&mt=10&id=' + metaData['itunesTrackId'][0]

			request = Request(itunesUrl,
				method='GET',
				meta = {'metaData': copy.deepcopy(metaData), 'dont_retry': True},
				headers={ 
					"User-Agent": "iTunes/9.1.1"
				},
				callback=self.parseItunesHtml)
				#errback=lambda x:self.parseItem(x,copy.deepcopy(metaData)) ) 
			yield request
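The commented-out block above hints at recovering ownerName from the entry's embedded HTML payload; a minimal working sketch of that idea, assuming HtmlXPathSelector is imported and the entry carries an atom content element of type "html":

			html = entry.select('./itunesu:content[@type="html"]/text()').extract()
			if html:
				hxs = HtmlXPathSelector(text=html[0])
				# the institution link in the rendered HTML names the owner
				owner = hxs.select('//a[contains(@href, "institution")]/text()').extract()
				if owner:
					metaData['ownerName'] = owner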
Example #12
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        xxs.register_namespace('soapenv',
                               'http://schemas.xmlsoap.org/soap/envelope/')
        xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
        xxs.register_namespace('xsi',
                               'http://www.w3.org/2001/XMLSchema-instance')
        xxs.register_namespace(
            'CurrentsAndMetadata',
            'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl'
        )

        timelist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()'
        ).extract()
        cspdlist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()'
        ).extract()
        cdirlist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()'
        ).extract()

        print len(timelist)

        for i in range(0, len(cdirlist)):
            sql_str = self.SQL_INSERT_STUB.format(
                self.get_current_station().lower(),
                str(timelist[i])[0:-2], str(cspdlist[i]), str(cdirlist[i]),
                'datafactory_currentdata')
            #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
            d_time_unware = datetime.datetime.strptime(
                str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
            d_time1 = pytz.utc.localize(d_time_unware)
            d_time = d_time1.astimezone(pytz.utc)
            if self.needStore(d_time):
                self.db.query(sql_str)

        self.db.commit()

        if timelist:
            sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
                DB_SETTINGS['DATABASE_TIME_TABLE'], self.get_current_station(),
                self.startDate.astimezone(
                    pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
                self.endDate.astimezone(
                    pytz.utc).strftime("%Y-%m-%d %H:%M:%S"))

            self.db.query(sql_str)
            self.db.commit()

        self.station_slot = self.station_slot + 1

        if self.station_slot < len(self.start_urls):
            yield Request(self.start_urls[self.station_slot])  # wrap in a Request; a bare URL string is not a valid Scrapy result
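The INSERT above interpolates values straight into the SQL string. If the self.db wrapper exposes a DB-API cursor (an assumption; the wrapper's API is not shown here), a parameterized sketch avoids the quoting and escaping problems:

        # Table names cannot be bound as parameters, so only the table name
        # comes from format(); the values go through the driver's placeholder
        # binding, which quotes and escapes them safely.
        sql = ("INSERT INTO {0} (sid, stime, etime) VALUES (%s, %s, %s)"
               .format(DB_SETTINGS['DATABASE_TIME_TABLE']))
        cursor = self.db.cursor()  # hypothetical: depends on the wrapper
        cursor.execute(sql, (
            self.get_current_station(),
            self.startDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
            self.endDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
        ))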
Example #13
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     base_url = get_base_url(response)
     xxs.register_namespace("f", "http://www.w3.org/2005/Atom")
     products = xxs.select('//f:entry')
     for product in products:
         product.register_namespace("g", "http://base.google.com/ns/1.0")
         product.register_namespace("p", "http://www.w3.org/2005/Atom")
         product_loader = ProductLoader(item=Product(), selector=product)
         name = product.select('./p:title/text()').extract()[0]
         if 'B-STOCK' in name.upper():
             continue
         product_loader.add_value('name', name)
         url = product.select('./p:link/@href').extract()[0]
         product_loader.add_value('url', urljoin_rfc(base_url, url))
         image_url = product.select('./g:image_link/text()').extract()
         if image_url:
             product_loader.add_value('image_url',
                                      urljoin_rfc(base_url, image_url[0]))
         category = product.select('./g:product_type/text()').extract()
         if category:
             product_loader.add_value('category', category[0])
         brand = product.select('./g:brand/text()').extract()
         if brand:
             product_loader.add_value('brand', brand[0])
         price = product.select('./g:sale_price/text()').extract()
         if price:
             product_loader.add_value('price', extract_price(price[0]))
         else:
             price = product.select('./g:price/text()').extract()
             product_loader.add_value('price', extract_price(price[0]))
         # sku = product.select('./g:gtin/text()').extract()
         # if sku:
         #     product_loader.add_value('sku', sku[0])
         identifier = product.select('./g:id/text()').extract()[0]
         product_loader.add_value('identifier', identifier)
         product_loader.add_value('sku', identifier)
         shipping_cost = product.select(
             './g:shipping/g:price/text()').extract()
         if shipping_cost:
             product_loader.add_value('shipping_cost',
                                      extract_price(shipping_cost[0]))
         product = product_loader.load_item()
         yield product
Example #14
	def parse(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		
		#programs = x.select('./program[position()<3]')
		programs = x.select('./program')
		allitems=[]
		for program in programs:
			parent={}
			
			parent['brandId'] = program.select('./systemRef[@systemId="pid.brand"][position()=1]/@key').extract()
			parent['brandIds'] = program.select('./systemRef[@systemId="pid.brand"]/@key').extract()
			parent['brandFeed'] = program.select('./link[@target="feed"]/@url').extract()[0]
			parent['brandName'] = program.select('./title/text()').extract()
			parent['brandShortName'] = program.select('./shortTitle/text()').extract()
			parent['brandDescription'] = program.select('./description/text()').extract()
			parent['brandHomepage'] = program.select('./link[@target="homepage"]/@url').extract()
			parent['brandImage'] = program.select('./image/@url').extract()
			parent['brandTimes'] = program.select('./@frequency').extract()
			parent['brandCurrentItem'] = program.select('./link[@target="currentItem"]/@url').extract()
			parent['brandLanguage'] = program.select('./@language').extract()
			parent['brandAvgDuration'] = program.select('./@typicalDuration').extract()
			parent['brandFrequency'] = program.select('./@frequency').extract()
			# parent['brandTags'] = program.select('xxx').extract()
			parent['brandGenres'] = program.select('./bbcGenre/@name').extract()
			parent['brandGenreIds'] = program.select('./systemRef[@systemId="pid.genre"]/@key').extract()
			
			parent['channelId'] = program.select('./network/@id').extract()
			parent['channelName'] = program.select('./network/@name').extract()
			# parent['channelDescription'] = program.select('//head/meta[@name="description"]/@content').extract()
			# parent['channelImage'] = 'http://sverigesradio.se/diverse/appdata/isidor/images/news_images/3297/459929_87_56.jpg'
			# parent['channelFeed'] = program.select('xxx').extract()
			parent['channelHomepage'] = 'http://www.bbc.co.uk/' + parent['channelId'][0]
			
			parent['ownerId'] = 'BBC'
			parent['ownerName'] = 'BBC Radio'
			parent['ownerKey'] = 'bbc'
			parent['ownerImage'] = 'http://static.bbci.co.uk/frameworks/barlesque/2.5.10/desktop/3.5/img/blq-blocks_grey_alpha.png'
			parent['ownerHomepage'] = 'http://www.bbc.co.uk/' + parent['ownerKey']
			
			request = Request(parent['brandFeed'], meta={'parent': parent}, callback=self.load_rss)
			allitems.append(request)
		return allitems
Example #15
 def parse(self, response):
   """
   Main parser
   """
   xxs = XmlXPathSelector(response)
   xxs.register_namespace('feedburner', "http://rssnamespace.org/feedburner/ext/1.0")
   # For each blog post we have:
   #
   # * A main url like: http://www.beppegrillo/YYYY/MM/page_name/index.html
   #   this contains the blog post and the "commenti più votati" (most-voted comments) section
   #
   # * A javascript page http://www.beppegrillo/YYYY/MM/page_name.js
   #   which contains a list of URLs pointing to pages containing
   #   subsets of the comments.
   #
   # Therefore, we have to return a request for each page, and a
   # request for each one of these subpages containing a subset of
   # the comments for later parsing, made by specific methods.
   for url in xxs.select('//feedburner:origLink/text()').extract():
     yield Request(url, callback=self.parse_page)
     yield Request(url.replace('/index.html', '.js'), callback=self.parse_javascript)
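The whole trick is the URL rewrite in the last line; for a post URL of the shape documented in the comments it behaves like this:

url = 'http://www.beppegrillo/2012/05/page_name/index.html'  # sample values for YYYY/MM
print url.replace('/index.html', '.js')
# http://www.beppegrillo/2012/05/page_name.js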
Example #16
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     xxs.register_namespace("g", "http://base.google.com/ns/1.0")
     products = xxs.select('//channel/item')
     for product in products:
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('url', 'link/text()')
         loader.add_xpath('name', 'title/text()')
         loader.add_xpath('image_url', 'g:image_link/text()')
         loader.add_xpath('price', 'g:price/text()')
         loader.add_xpath('brand', 'g:brand/text()')
         categories = product.select(
             'g:product_type/text()').extract()[0].split(' &gt; ')
         loader.add_value('category', categories)
         loader.add_xpath('sku', 'g:id/text()')
         loader.add_xpath('identifier', 'g:id/text()')
         stock = product.select(
             'g:availability/text()').extract()[0].lower()
         if stock != 'in stock':
             loader.add_value('stock', 0)
         yield loader.load_item()
Example #17
	def parseFeed(self, response):
		jsonResponse = response.meta['jsonResponse']
		
		brandStats = jsonResponse['stats']['stats_fields']['episodePublishDate']
		#maxDate = brandStats['max']
		#updateDoc = '<delete><query>brandFeed:"'+brandFeed+'"</query></delete>'


		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")

		#########
		newEpisodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]]')
		metaData = {}
		metaData['rssUrl'] = response.url
		episodes = []
		#create a single solr update doc that contains all the new episodes and deletes expired ones
		

		for xmlEpisode in newEpisodes:
			jsonBrand = jsonResponse['grouped']['brandFeed']['groups'][0]['doclist']['docs'][0]
			episode = self.load_item(jsonBrand, xmlEpisode, metaData).__dict__.values()[0]
			episodes.append(episode)

		updatejson = JSONEncoder().encode(episodes)
		yield Request(
			url=self.solrUpdateUrl, 
			method='POST', 
			body=updatejson,
			headers={'Content-Type':'application/json'},
			callback=self.dummyEnd
		)
Example #18
	def load_podcast_rss(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")
		metaData = response.meta['metaData']
		itunesTrackId =  metaData['itunesTrackId']
		metaData['rssUrl'] = response.url
		
		##########
		# a limit of 49 episodes (position()<50) is hard-coded here; this should live in settings somewhere
		#########
		episodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]][position()<50]')
		podcastEpisodeCount = str(len(episodes))

		items = []
		self.totalPodcastEpisodes = self.totalPodcastEpisodes + len(episodes)
		if len(episodes)==0:
			self.logProgress('Empty feed', metaData['brandName'][0], '', itunesTrackId, log.WARNING, ('No episodes for %s' % (response.url)))
			
			metaData['itemtype']=['noepisodes']
			item = self.load_item(x.select('//channel'), metaData)
			yield item
		else:
			podcastEpisodeIndex = str(len(items))  # items is never appended to in this generator, so this index stays '0'
			self.logProgress('load_podcast_rss', metaData['brandName'][0], '', itunesTrackId, log.INFO, ('%s/%s' % (podcastEpisodeIndex, podcastEpisodeCount)))
			for episode in episodes:
				metaData['itemtype']=['ondemand']
				item = self.load_item(episode, metaData)
				yield item
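Following up the hard-coded cap noted above, a sketch of sourcing it from Scrapy settings instead; PODCAST_EPISODE_LIMIT is a hypothetical custom setting, and on older Scrapy versions the settings object hangs off self.crawler.settings rather than self.settings:

		limit = self.settings.getint('PODCAST_EPISODE_LIMIT', 50)  # hypothetical setting
		episodes = x.select(
			'//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]]'
			'[position()<=%d]' % limit)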
Example #19
	def parsePlaylistXML(self, response):
		metaData = response.meta['metaData']
		xxs = XmlXPathSelector(response)

		xxs.register_namespace('itunesu2', 'http://www.apple.com/itms/')

		for episode in xxs.select('//itunesu2:Protocol/itunesu2:plist/itunesu2:dict/itunesu2:array/itunesu2:dict[itunesu2:dict/itunesu2:string[.="mp3"]]'):
			metaData['episodeDuration'] = episode.select('./itunesu2:dict/itunesu2:key[.="duration"]/following-sibling::itunesu2:integer[1]/text()').extract()
			metaData['episodeTitle'] = episode.select('./itunesu2:dict/itunesu2:key[.="songName"]/following-sibling::itunesu2:string[1]/text()').extract()
			metaData['rssItemMediaType'] = episode.select('./itunesu2:dict/itunesu2:key[.="fileExtension"]/following-sibling::itunesu2:string[1]/text()').extract()
			metaData['episodeDescription'] = episode.select('./itunesu2:dict/itunesu2:key[.="description"]/following-sibling::itunesu2:string[1]/text()').extract()
			metaData['itunesArtistName'] = episode.select('./itunesu2:dict/itunesu2:key[.="artistName"]/following-sibling::itunesu2:string[1]/text()').extract()
			metaData['episodePublishDate'] = episode.select('./itunesu2:dict/itunesu2:key[.="releaseDate"]/following-sibling::itunesu2:string[1]/text()').extract()
			metaData['brandFeed'] = episode.select('./itunesu2:dict/itunesu2:key[.="feedURL"]/following-sibling::itunesu2:string[1]/text()').extract()
			metaData['episodeId'] = episode.select('./itunesu2:dict/itunesu2:key[.="episodeGUID"]/following-sibling::itunesu2:string[1]/text()').extract()
			metaData['episodeMedia'] = episode.select('./itunesu2:key[.="URL"]/following-sibling::itunesu2:string[1]/text()').extract()

			if 'itunesArtworkUrl170' not in metaData:
				metaData['itunesArtworkUrl170'] = episode.select('./itunesu2:key[.="artworkURL"]/following-sibling::itunesu2:string[1]/text()').extract()

			if 'episodeFirstBroadcast' not in metaData:
				metaData['episodeFirstBroadcast'] = metaData['episodePublishDate']

			yield self.parseItem(metaData)
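Most of the XPaths above repeat one plist trick: an Apple plist <dict> alternates <key> and value elements, so a value is addressed as the first following sibling of its <key>. A small helper sketch factoring that out (the helper name is mine, not from the original):

		def plist_value(node, key, value_tag='string'):
			# <dict><key>songName</key><string>...</string>...</dict>
			xpath = ('./itunesu2:dict/itunesu2:key[.="%s"]'
				'/following-sibling::itunesu2:%s[1]/text()' % (key, value_tag))
			return node.select(xpath).extract()

		# e.g. metaData['episodeTitle'] = plist_value(episode, 'songName')
		#      metaData['episodeDuration'] = plist_value(episode, 'duration', 'integer')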
Example #20
	def load_rss(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")
		
		title = x.select('//./channel/title/text()').extract()[0]
		parent = response.meta['parent']
		
		request = Request('http://itunes.apple.com/search?term='+ title +'&entity=podcast&attribute=titleTerm', meta = {'parent': parent, 'rss': x, 'rssUrl': response.url}, callback=self.get_itunes_info)
		
		return request
Example #21
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        xxs.register_namespace('soapenv', 'http://schemas.xmlsoap.org/soap/envelope/')
        xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
        xxs.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')        
        xxs.register_namespace('CurrentsAndMetadata', 'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl')

        timelist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()').extract()
        cspdlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()').extract()
        cdirlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()').extract()

        print len(timelist) 
        
        for i in range(0, len(cdirlist)):
            sql_str = self.SQL_INSERT_STUB.format(self.get_current_station().lower(), str(timelist[i])[0:-2], str(cspdlist[i]), str(cdirlist[i]), 'datafactory_currentdata')
            #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
            d_time_unware = datetime.datetime.strptime(str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
            d_time1 = pytz.utc.localize(d_time_unware)
            d_time = d_time1.astimezone(pytz.utc)
            if self.needStore(d_time):
                self.db.query(sql_str)

        self.db.commit()

        if timelist:
            sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
                DB_SETTINGS['DATABASE_TIME_TABLE'],
                self.get_current_station(),
                self.startDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
                self.endDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
            )

            self.db.query(sql_str)
            self.db.commit()

        self.station_slot = self.station_slot + 1

        if self.station_slot < len(self.start_urls):
            yield Request(self.start_urls[self.station_slot])  # wrap in a Request; a bare URL string is not a valid Scrapy result
Example #22
	def load_rss(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")
		
		title = x.select('//./channel/image/title/text()').extract()[0]
		parent = response.meta['parent']

		# alternative URL?
		#http://ax.itunes.apple.com/WebObjects/MZStoreServices.woa/wa/wsSearch?term=%22Les%20Aventuriers%22&entity=podcast&attribute=titleTerm&country=FR
		itunesUrl = 'http://itunes.apple.com/search?term='+ title +'&entity=podcast&attribute=titleTerm&country=FR'

		request = Request(itunesUrl, dont_filter=True, meta = {'parent': parent, 'rss': x, 'rssUrl': response.url}, callback=self.get_itunes_info)
		print itunesUrl
		return request
Example #23
	def parseAudioBookRSS(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")
		metaData = response.meta['metaData']

		chapters = x.select('//channel/item')
		items = []

		
		for chapter in chapters:
			metaData['audioType']=['audiobook']
			item = self.load_item(chapter, copy.deepcopy(metaData))
			items.append(item)


		return items
Example #24
    def test_selector_namespaces_multiple(self):
        body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
            xmlns:b="http://somens.com"
            xmlns:p="http://www.scrapy.org/product" >
    <b:Operation>hello</b:Operation>
    <TestTag b:att="value"><Other>value</Other></TestTag>
    <p:SecondTestTag><material/><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
</BrowseNode>
        """
        response = XmlResponse(url="http://example.com", body=body)
        x = XmlXPathSelector(response)

        x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
        x.register_namespace("p", "http://www.scrapy.org/product")
        x.register_namespace("b", "http://somens.com")
        self.assertEqual(len(x.select("//xmlns:TestTag")), 1)
        self.assertEqual(x.select("//b:Operation/text()").extract()[0], 'hello')
        self.assertEqual(x.select("//xmlns:TestTag/@b:att").extract()[0], 'value')
        self.assertEqual(x.select("//p:SecondTestTag/xmlns:price/text()").extract()[0], '90')
        self.assertEqual(x.select("//p:SecondTestTag").select("./xmlns:price/text()")[0].extract(), '90')
        self.assertEqual(x.select("//p:SecondTestTag/xmlns:material").extract()[0], '<material/>')
Example #25
	def get_itunes(self, response):
	
		itunes = json.loads(response.body)
		rss = XmlXPathSelector(response.meta['rss'])
		rss.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		rss.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		rss.register_namespace("media", "http://search.yahoo.com/mrss/")

		episodes = rss.select('./channel/item')
		items = []
		
		for episode in episodes:
			item = PodcastItem()
			
			item['brandGenres'] = episode.select('//./channel/itunes:category/@text').extract()
			item['brandGenreIds'] = episode.select('//./channel/./itunes:category/@text').extract()
			item['brandDescription'] = episode.select('//./channel/description/text()').extract()
			item['brandShortName'] = episode.select('//./channel/title/text()').extract()
			item['brandLanguage'] = episode.select('//./channel/language/text()').extract()
			item['brandHomepage'] = episode.select('//./channel/link/text()').extract()
			item['brandImage'] = episode.select('//./channel/itunes:image/@href').extract()
			
			
			item['channelId'] = episode.select('//./channel/itunes:author/text()').extract()
			item['channelName'] = episode.select('//./channel/itunes:author/text()').extract()
			#item['channelDescription'] = parent['channelDescription']
			item['channelImage'] = episode.select('//./channel/itunes:image/@href').extract()
			item['channelHomepage'] = episode.select('//./channel/link/text()').extract()
			
			item['ownerId'] = 'NPR'
			item['ownerName'] = 'NPR'
			item['ownerKey'] = 'npr'
			item['ownerImage'] = 'http://media.npr.org/chrome/news/nprlogo_138x46.gif'
			item['ownerHomepage'] = 'http://www.npr.org'
			
			item['brandFeed'] = response.meta['rss']
			item['brandName'] =  episode.select('//./channel/title/text()').extract()
			item['brandId'] = response.meta['rss']
			
			item['type'] = 'podcast'
			
			item['id'] = episode.select('./guid/text()').extract()
			item['episodeId'] = episode.select('./guid/text()').extract()
			item['episodeTitle'] = episode.select('./title/text()').extract()
			item['episodeSubtitle'] = episode.select('./itunes:subtitle/text()').extract()
			item['episodeDescription'] = episode.select('./description/text()').extract()
			#item['episodeStart'] = episode.select('xxx').extract()
			#item['episodeEnd'] = episode.select('xxx').extract()
			item['episodeDuration'] = episode.select('./enclosure/@length').extract()
			item['episodePublishDate'] = episode.select('./pubDate/text()').extract()
			item['episodeMimeType'] = episode.select('./media:content/@type').extract()
			item['episodeMedia'] = episode.select('./guid/text()').extract()
			#item['episodeImage'] = episode.select('xxx').extract()
			#item['episodeHomepage'] = episode.select('xxx').extract()
			item['episodeFirstBroadcast'] = episode.select('./pubDate/text()').extract()
			#item['episodeAvailableUntil'] = episode.select('xxx').extract()
			#item['episodeTags'] = episode.select('xxx').extract()
			#item['episodeRelatedTitle'] = episode.select('xxx').extract()
			#item['episodeRelatedUrl'] = episode.select('xxx').extract()
			
			#item['itunesGenres'] =  itunes['results'][0]['genres'][0]
			#item['itunesGenreIds'] =   itunes['results'][0]['genreIds'][0]
			#item['itunesartworkUrl30'] =  itunes['results'][0]['artworkUrl30'][0]
			
			
			
			items.append(item)
		return items
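One caveat in get_itunes above: response.meta['rss'] is stored as an already-built XmlXPathSelector (see the load_rss examples, which pass 'rss': x), while the XmlXPathSelector constructor expects a response or text. Under that assumption, a sketch that reuses the stored selector directly:

		rss = response.meta['rss']  # already an XmlXPathSelector built in load_rss
		rss.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		episodes = rss.select('./channel/item')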
Example #26
    def load_podcast_rss(self, response):
        x = XmlXPathSelector(response)
        x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
        x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
        x.register_namespace("media", "http://search.yahoo.com/mrss/")
        metaData = response.meta["metaData"]
        genreName = metaData["genreName"]
        genreCount = metaData["genreCount"]
        genreIndex = metaData["genreIndex"]
        letterValue = metaData["letterValue"]
        pageCount = metaData["pageCount"]
        pageIndex = metaData["pageIndex"]
        popularInGenre = metaData["popularInGenre"]
        podcastId = metaData["podcastId"]
        podcastName = metaData["podcastName"]
        podcastGenreIndex = metaData["podcastGenreIndex"]
        podcastGenreCount = metaData["podcastGenreCount"]
        podcastDescription = metaData["podcastDescription"]
        podcastCount = str(len(self.indexedPodcasts))
        itunes1 = metaData["itunes1"]
        # this is json from a 2nd lookup to get the "artist" data, where artist may be channel or owner data
        itunes2 = metaData["itunes2"] if "itunes2" in metaData else {}
        episodes = x.select("./channel/item")
        podcastEpisodeCount = str(len(episodes))
        rssUrl = response.url
        items = []

        for episode in episodes:
            l = XPathItemLoader(PodcastItem(), selector=episode)

            #############
            # RSS per channel values
            #############
            l.add_xpath("rssChannelTitle", "//./channel/title/text()")
            l.add_xpath("rssChannelLink", "//./channel/link/text()")
            l.add_xpath("rssChannelDescription", "//./channel/description/text()")
            l.add_xpath("rssChannelCopyright", "//./channel/copyright/text()")
            l.add_xpath("rssChannelLanguage", "//./channel/language/text()")
            l.add_xpath("brandLanguage", "//./channel/language/text()")
            l.add_xpath("rssChannelGenerator", "//./channel/generator/text()")
            l.add_xpath("rssChannelPubDate", "//./channel/pubDate/text()")
            l.add_xpath("rssChannelitunesAuthor", "//./channel/itunes:author/text()")
            l.add_xpath("rssChannelitunesCategory", "//./channel/itunes:category/@text")
            l.add_xpath("rssChannelitunesExplicit", "//./channel/itunes:explicit/text()")
            l.add_xpath("rssChannelitunesImage", "//./channel/itunes:image/@href")
            l.add_xpath("rssChannelitunesKeywords", "//./channel/itunes:keywords/text()")
            l.add_xpath("rssChannelitunesOwnerEmail", "//./channel/itunes:owner/itunes:email/text()")
            l.add_xpath("rssChannelitunesOwnerName", "//./channel/itunes:owner/itunes:name/text()")
            l.add_xpath("rssChannelitunesSubtitle", "//./channel/itunes:subtitle/text()")
            l.add_xpath("rssChannelitunesSummary", "//./channel/itunes:summary/text()")
            l.add_xpath("rssChannelImageUrl", "//./channel/image/url/text()")
            l.add_xpath("rssChannelImageTitle", "//./channel/image/title/text()")
            l.add_xpath("rssChannelImageLink", "//./channel/image/link/text()")
            l.add_xpath("rssChannelLastBuildDate", "//./channel/lastBuildDate/text()")

            #############
            # RSS per item values
            #############
            l.add_xpath("rssItemTitle", "./title/text()")
            l.add_xpath("rssItemDescription", "./description/text()")
            l.add_xpath("rssItemPubDate", "./pubDate/text()")
            l.add_xpath("rssItemLink", "./link/text()")
            l.add_xpath("rssItemGuid", "./guid/text()")
            l.add_xpath("rssItemAuthor", "./author/text()")
            l.add_xpath("rssItemitunesSummary", "./itunes:summary/text()")
            l.add_xpath("rssItemitunesKeywords", "./itunes:keywords/text()")
            l.add_xpath("rssItemitunesDuration", "./itunes:duration/text()")
            l.add_xpath("rssItemitunesExplicit", "./itunes:explicit/text()")
            l.add_xpath("rssItemEnclosureUrl", "./enclosure/@url")
            l.add_xpath("rssItemEnclosureLength", "./enclosure/@length")
            l.add_xpath("rssItemEnclosureType", "./enclosure/@type")

            l.add_xpath("episodeDuration", "./itunes:duration/text()")

            #############
            # iTunes lookup meta values
            #############
            if "artistId" in itunes1:
                l.add_value("itunesArtistId", str(itunes1["artistId"]))
            if "artistName" in itunes1:
                l.add_value("itunesArtistName", itunes1["artistName"])
            if "artistViewUrl" in itunes1:
                l.add_value("itunesArtistViewUrl", itunes1["artistViewUrl"])
            if "artworkUrl100" in itunes1:
                l.add_value("itunesArtworkUrl100", itunes1["artworkUrl100"])
            if "artworkUrl30" in itunes1:
                l.add_value("itunesArtworkUrl30", itunes1["artworkUrl30"])
            if "artworkUrl60" in itunes1:
                l.add_value("itunesArtworkUrl60", itunes1["artworkUrl60"])
            if "artworkUrl600" in itunes1:
                l.add_value("itunesArtworkUrl600", itunes1["artworkUrl600"])
            if "collectionId" in itunes1:
                l.add_value("itunesCollectionId", str(itunes1["collectionId"]))
            if "collectionName" in itunes1:
                l.add_value("itunesCollectionName", itunes1["collectionName"])
            if "collectionPrice" in itunes1:
                l.add_value("itunesCollectionPrice", str(itunes1["collectionPrice"]))
            if "collectionViewUrl" in itunes1:
                l.add_value("itunesCollectionViewUrl", itunes1["collectionViewUrl"])
            if "genreIds" in itunes1:
                l.add_value("itunesGenreIds", itunes1["genreIds"])
            if "genres" in itunes1:
                l.add_value("itunesGenres", itunes1["genres"])
            if "itunesCollectionCensoredName" in itunes1:
                l.add_value("itunesCollectionCensoredName", itunes1["collectionCensoredName"])
            if "itunesCollectionExplicitness" in itunes1:
                l.add_value("itunesCollectionExplicitness", itunes1["collectionExplicitness"])
            if "itunesCurrency" in itunes1:
                l.add_value("itunesCurrency", itunes1["currency"])
            # if 'itunesPopular' in itunes1 : l.add_value('itunesPopular', itunes1['popular'])
            # if 'itunesPopularInGenre' in itunes1 : l.add_value('itunesPopularInGenre', itunes1['popularInGenre'])
            if "itunesTrackCensoredName" in itunes1:
                l.add_value("itunesTrackCensoredName", itunes1["trackCensoredName"])
            if "itunesTrackCount" in itunes1:
                l.add_value("itunesTrackCount", itunes1["trackCount"])
            if "primaryGenreName" in itunes1:
                l.add_value("itunesPrimaryGenreName", itunes1["primaryGenreName"])
            if "releaseDate" in itunes1:
                l.add_value("itunesReleaseDate", itunes1["releaseDate"])
            if "trackExplicitness" in itunes1:
                l.add_value("itunesTrackExplicitness", itunes1["trackExplicitness"])
            if "trackId" in itunes1:
                l.add_value("itunesTrackId", str(itunes1["trackId"]))
            if "trackPrice" in itunes1:
                l.add_value("itunesTrackPrice", str(itunes1["trackPrice"]))
            if "trackViewUrl" in itunes1:
                l.add_value("itunesTrackViewUrl", itunes1["trackViewUrl"])

            l.add_value("itunesPopular", metaData["itunesPopular"])
            l.add_value("itunesPopularInGenre", metaData["itunesPopularInGenre"])
            l.add_value("itunesSimilar", metaData["podcastSimilar"])
            l.add_value("itunesRelated", metaData["podcastRelated"])

            item = l.load_item()

            #############
            ## Test and copy from alternate sources
            #############
            # Do some more if else/else to fill the fields here.....

            #######
            ## Episode items
            #############
            item["id"] = item["rssItemGuid"] if "rssItemGuid" in item else ["Unknown"]
            item["episodeId"] = item["rssItemGuid"] if "rssItemGuid" in item else ["Unknown"]
            item["episodeTitle"] = item["rssItemTitle"] if "rssItemTitle" in item else ["Unknown"]
            item["episodeSubtitle"] = item["rssItemSubtitle"] if "rssItemSubtitle" in item else ["Unknown"]
            item["episodeDescription"] = item["rssItemDescription"] if "rssItemDescription" in item else ["Unknown"]
            item["episodeDuration"] = item["episodeDuration"] if "episodeDuration" in item else ["0"]

            item["episodeMimeType"] = item["rssItemEnclosureType"] if "rssItemEnclosureType" in item else ["Unknown"]
            item["episodeMedia"] = item["rssItemEnclosureUrl"] if "rssItemEnclosureUrl" in item else ["Unknown"]
            item["episodeImage"] = item["rssChannelImageUrl"] if "rssChannelImageUrl" in item else ["Unknown"]
            item["episodeHomepage"] = item["rssItemLink"] if "rssItemLink" in item else ["Unknown"]
            # item['episodeFirstBroadcast'] = item['rssItemGuid']
            # item['episodeAvailableUntil'] = item['rssItemGuid']
            item["episodeTags"] = item["rssItemitunesKeywords"] if "rssItemitunesKeywords" in item else ["Unknown"]
            # item['episodeRelatedTitle'] = item['rssItemGuid']
            # item['episodeRelatedUrl'] = item['rssItemGuid']

            # item['seriesId'] = item['xxx']
            # item['seriesName'] = item['xxx']
            # item['seriesDescription'] = item['xxx']
            # item['seriesImage'] = item['xxx']
            # item['seriesFeed'] = item['xxx']
            # item['seriesCurrentItem'] = item['xxx']
            # item['seriesHomepage'] = item['xxx']
            # item['seriesTags'] = item['xxx']
            if "itunesReleaseDate" in item:
                item["episodePublishDate"] = item["itunesReleaseDate"]
            elif "rssItemPubDate" in item:
                item["episodePublishDate"] = item["rssItemPubDate"]
            else:
                item["episodePublishDate"] = ["1900-01-01T00:00:00Z"]

            #######
            ## Brand items
            #############
            item["brandId"] = [rssUrl]
            item["brandFeed"] = [rssUrl]

            # brandName
            if "rssChannelTitle" in item:
                item["brandName"] = item["rssChannelTitle"]
            elif "itunesTrackName" in item:
                item["brandName"] = item["itunesTrackName"]
            else:
                item["brandName"] = ["Unknown"]

            if "rssChannelDescription" in item:
                item["brandDescription"] = item["rssChannelDescription"]
            elif "podcastDescription" in metaData:
                item["brandDescription"] = metaData["podcastDescription"]
            else:
                item["brandDescription"] = ["Unknown"]

            item["brandSimilar"] = metaData["podcastSimilar"]
            item["brandRelated"] = metaData["podcastRelated"]
            item["brandHomepage"] = item["rssChannelLink"] if "rssChannelLink" in item else ["Unknown"]
            item["brandLanguage"] = item["brandLanguage"] if "brandLanguage" in item else ["xx"]
            item["brandTags"] = item["rssChannelitunesKeywords"] if "rssChannelitunesKeywords" in item else ["Unknown"]
            item["brandGenres"] = item["itunesGenres"] if "itunesGenres" in item else ["Unknown"]
            item["brandGenreIds"] = item["itunesGenreIds"] if "itunesGenreIds" in item else ["Unknown"]

            if "itunesArtworkUrl100" in item:
                item["brandImage"] = item["itunesArtworkUrl100"]
            elif "rssChannelitunesImage" in item:
                item["brandImage"] = item["rssChannelitunesImage"]
            else:
                item["brandImage"] = ["Unknown"]

            # itunes feeds do not stick to the parent genre in itunesPrimaryGenreName, as seen at
            # http://itunes.apple.com/us/genre/podcasts/id26?mt=2
            # this makes it difficult to keep a smallish number of top-level categories,
            # so the text value from the scrape is used in preference
            if "genreName" in metaData:
                item["brandCategory"] = metaData["genreName"]
            elif "itunesPrimaryGenreName" in item:
                item["brandCategory"] = item["itunesPrimaryGenreName"]
            elif "rssChannelitunesCategory" in item:
                item["brandCategory"] = [item["rssChannelitunesCategory"][0]]
            else:
                item["brandCategory"] = ["Unknown"]

            #######
            ## Channel items
            #############
            item["channelId"] = item["itunesArtistId"] if "itunesArtistId" in item else ["Unknown"]

            if "rssChannelitunesAuthor" in item:
                item["channelName"] = item["rssChannelitunesAuthor"]
            elif "itunesArtistName" in item:
                item["channelName"] = item["itunesArtistName"]
            else:
                item["channelName"] = ["Unknown"]

            #######
            ## Depends on what itunes returns for the Artist lookup
            #######
            # itunes1 is based on the scraped podcastId; itunes2 is based on artistId from the first itunes lookup, if present

            # if itunes1.artistId is NOT present:
            # brandName = itunes1.trackName, channelName = itunes1.artistName, ownerName remains unknown
            if "artistId" not in itunes1:
                item["brandName"] = itunes1["trackName"]
                item["channelName"] = itunes1["artistName"]
                item["ownerName"] = "Unknown"

            # if itunes1.artistId is present, a 2nd lookup is made to itunes:
            # if itunes1.artistId == itunes2.artistId,
            # brandName = itunes1.trackName, channelName remains unknown, ownerName = itunes1.artistName
            elif "artistId" in itunes1 and "artistId" in itunes2:
                if itunes1["artistId"] == itunes2["artistId"]:
                    item["brandName"] = itunes1["trackName"]
                    item["channelName"] = "Unknown"
                    item["ownerName"] = itunes1["artistName"]
                    item["ownerId"] = itunes1["artistId"]

                # if itunes1.artistId != itunes2.artistId,
                # brandName = itunes1.trackName, channelName = itunes1.artistName, ownerName = itunes2.artistName
                else:
                    item["brandName"] = itunes1["trackName"]
                    item["channelName"] = itunes1["artistName"]
                    item["ownerName"] = itunes2["artistName"]
                    item["ownerId"] = itunes2["artistId"]

            self.indexedEpisodes.append(item["episodeId"])
            episodeCount = str(len(self.indexedEpisodes))
            episodeTitle = item["episodeTitle"]
            podcastEpisodeIndex = str(len(items))

            self.log(
                "[%s/%s] Indexing genre %s[%d of %d] letter %s page [%s of %d] podcast [id:%s] [%d of %d] %s : [%s/%s] %s"
                % (
                    episodeCount,
                    podcastCount,
                    genreName,
                    genreIndex,
                    genreCount,
                    letterValue,
                    pageIndex,
                    pageCount,
                    podcastId,
                    podcastGenreIndex,
                    podcastGenreCount,
                    podcastName,
                    podcastEpisodeIndex,
                    podcastEpisodeCount,
                    episodeTitle,
                ),
                level=log.INFO,
            )

            items.append(item)
        return items
Example #27
	def parse(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace('atom', 'http://www.w3.org/2005/Atom')
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")
		x.register_namespace("im", "http://itunes.apple.com/rss")
		
		# either an RSS feed of items,
		# or a feed of feeds with atom entry elements
		items = x.select('//rss/item')
		entries = x.select('//atom:entry')
		print entries, items
		for entry in entries:
			metaData={}
			metaData['brandName'] = entry.select('./atom:title/text()').extract()
			metaData['itunesTrackId'] = entry.select('./atom:id/@im:id').extract()
			metaData['brandDescription'] = entry.select('./atom:summary/text()').extract()
			if metaData['itunesTrackId']:
				metaData['itunesTrackId'] = metaData['itunesTrackId'][0]

			#itunes podcast html
			#from an Id
			if 'itunesTrackId' in metaData and metaData['itunesTrackId']:
				self.logProgress('parsePage from Id', metaData['brandName'][0], '', metaData['itunesTrackId'], log.INFO, str(metaData['itunesTrackId']) )

				request = Request('http://itunes.apple.com/lookup?id='+ metaData['itunesTrackId'], meta = {'metaData': copy.deepcopy(metaData)}, callback=self.getItunesTrackJson)
			else:
				# no id available, so search by title instead
				self.logProgress('parsePage from title', metaData['brandName'][0], '', '---------', log.INFO)
				try:
					ownerName = metaData['ownerName'][0]
				except (KeyError, IndexError):
					ownerName = ''
				#&attribute=titleTerm removed whilst using the owner name in the string as well
				request = Request('http://itunes.apple.com/search?term='+ metaData['brandName'][0] + ownerName +'&entity=podcast', meta = {'metaData': copy.deepcopy(metaData)}, callback=self.getItunesTrackJson)
			
			self.indexedPodcasts.append(1)
			yield request
			
		for item in items:
			metaData={}
			metaData['brandName'] = item.select('./title/text()').extract()
			metaData['itunesTrackId'] = item.select('./id/@im:id').extract()
			metaData['brandDescription'] = item.select('./summary/text()').extract()
			
			
			if metaData['itunesTrackId']:
				metaData['itunesTrackId'] = metaData['itunesTrackId'][0]

			#itunes podcast html
			#from an Id
			if 'itunesTrackId' in metaData and metaData['itunesTrackId']:
				self.logProgress('parsePage from Id', metaData['brandName'][0], '', metaData['itunesTrackId'], log.INFO, str(metaData['itunesTrackId']) )

				request = Request('http://itunes.apple.com/lookup?id='+ metaData['itunesTrackId'], meta = {'metaData': copy.deepcopy(metaData)}, callback=self.getItunesTrackJson)
			else:
				# no id available, so search by title instead
				self.logProgress('parsePage from title', metaData['brandName'][0], '', '---------', log.INFO)
				try:
					ownerName = metaData['ownerName'][0]
				except (KeyError, IndexError):
					ownerName = ''
				#&attribute=titleTerm removed whilst using the owner name in the string as well
				request = Request('http://itunes.apple.com/search?term='+ metaData['brandName'][0] + ownerName +'&entity=podcast', meta = {'metaData': copy.deepcopy(metaData)}, callback=self.getItunesTrackJson)
			
			self.indexedPodcasts.append(1)
			yield request
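The entry and item loops above duplicate the same lookup-or-search branching; a sketch of factoring it into one helper method (the helper name is mine, not from the original):

	def _itunes_lookup_request(self, metaData):
		# Prefer a direct id lookup; fall back to a title (+ owner) search,
		# mirroring the two branches above.
		if metaData.get('itunesTrackId'):
			url = 'http://itunes.apple.com/lookup?id=' + metaData['itunesTrackId']
		else:
			ownerName = (metaData.get('ownerName') or [''])[0]
			url = ('http://itunes.apple.com/search?term='
				+ metaData['brandName'][0] + ownerName + '&entity=podcast')
		return Request(url, meta={'metaData': copy.deepcopy(metaData)},
			callback=self.getItunesTrackJson)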
Example #28
    def parse(self, response):
        x = XmlXPathSelector(response)
        x.remove_namespaces()
        x.register_namespace("rdf",
                             "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        items = []
        items = x.select('//record/metadata/RDF')

        jsons = []

        for item in items:
            creator = item.select(
                'MetaResource/creator/Agent/name/text()').extract()
            title = item.select('Resource/title/text()').extract()
            uri = item.select('Resource/screen/Image/@rdf:about').extract()
            tags = item.select(
                'Resource/subject/Description/value/text()').extract()
            thumbnail = item.select(
                'Resource/thumbnail/Image/@rdf:about').extract()
            lat = item.select(
                'Resource/spatial/Description/lat/text()').extract()
            long = item.select(
                'Resource/spatial/Description/long/text()').extract()
            locality = item.select(
                'Resource/spatial/Description/locality/text()').extract()

            tags_string = '"' + '", "'.join(tags) + '"'

            if not lat:
                newlat = 'null'
            else:
                newlat = lat[0]

            if not long:
                newlong = 'null'
            else:
                newlong = long[0]

            if not locality:
                newloc = ''
            else:
                newloc = locality[0]

            json_entry = '{"title": "' + title[0] + '", "uri": "' + uri[
                0] + '", "attribution_uri": "' + uri[
                    0] + '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail[
                            0] + '", "media_geo_latitude": ' + newlat + ', "media_geo_longitude": ' + newlong + ', "location": "' + newloc + '", "tags": [' + tags_string + '], "archive":"Yahoo! Japan", "media_type": "Image", "layer_type": "Image", "child_items_count":0, "published":1}, '

            jsons.append(json_entry)

        resumptionToken = x.select('//resumptionToken/text()').extract()
        if not resumptionToken:
            open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        else:
            nextFileLink = "http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + resumptionToken[0].encode('ascii')
            open(resumptionToken[0].encode('ascii') + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
            # only follow the next page when a resumption token was returned;
            # a Request with an empty URL would raise ValueError
            yield Request(nextFileLink, callback=self.parse)
Example #29
  def parse(self, response):

    #########
    # Setup #
    #########
    x = XmlXPathSelector(response)
    x.remove_namespaces()
    x.register_namespace('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')

    category     = getCAT()
    category     = category.upper()
    output_path  = self.PATH + category.lower() + '_output/'
    token_path   = output_path + '.previous_resumption_token'
    dup_path     = output_path + '.dup_list'
    items        = []
    items        = x.select('//record/metadata/RDF')
    result       = []
    id_list      = []
    nextFileLink = ''

    resumption_token = x.select('//resumptionToken/text()').extract()
    saveResumptionToken(resumption_token, token_path)

    ###############
    # Parse Items #
    ###############
    print '****** PARSING FILE... ******'
    for item in items:

      ####################
      ##### creator ######
      ##### archive ######
      #### layer_type ####
      ####################
      media_creator_username = '******'
      archive                = 'Kahoku Shimpo Disasters Archive'
      layer_type             = 'Image' 

      ####################
      #### media_type ####
      ####################
      if category == 'MOVIE': media_type = 'Video'
      elif category == 'DOCUMENT' or category == 'OTHER': media_type = 'Headline'
      elif category == 'IMAGE': media_type = 'Image'

      ######################
      # media_date_created #
      ######################
      media_date_created = item.select('Resource/created/text()').extract()
      media_date_created = processField(media_date_created)

      ##################
      #### abstract ####
      ##### title ######
      ##################
      abstract = item.select('Resource/abstract/text()').extract()
      title    = item.select('Resource/title/text()').extract()
      title    = processField(title)
      abstract = processField(abstract)
      abstract = abstract.replace('\r\n', '')

      # Abstract tends to be more unique, though not always there. Title is often repetitive but more consistently present.
      if not abstract: abstract = title

      ####################
      ##### unique_id ####
      ####################
      unique_id = item.select('Resource/identifier/text()').extract()
      unique_id = str(unique_id[0])
      # Used for de-duping
      id_list.append(unique_id)

      ####################
      ###### Source ###### 
      ####################
      source = item.select('Resource/@rdf:about').extract()
      source = processField(source)

      ####################
      ####### URI ######## 
      ####################
      # Download the image if it has not already been downloaded
      if category == 'IMAGE':
        uri = item.select('Resource/screen/Image/@rdf:about').extract()
        downloaded = os.path.exists(output_path + unique_id + '.jpg')
        if uri and not downloaded:
          uri = uri[0]
          urllib.urlretrieve(uri, output_path + unique_id + '.jpg')
          # Freshly fetched images are pointed at the mirrored S3 copy;
          # already-downloaded ones keep their source URL
          uri = 'https://s3.amazonaws.com/JDA-Files/' + unique_id

      if category == 'MOVIE':
        uri = item.select('Resource/ogg/Image/@rdf:about').extract()

      if category == 'DOCUMENT' or category == 'OTHER':
        uri = source

      uri = processField(uri)

      ####################
      #### Thumbnail ##### 
      ####################
      thumbnail_url = item.select('Resource/thumbnail/Image/@rdf:about').extract()
      thumbnail_url = processField(thumbnail_url)

      ####################
      ####### Tags ####### 
      ####################
      tags = item.select('Resource/subject/Description/value/text()').extract()
      if not tags:
        tags_string = ''   # the template below wraps this in [...], so '' -> []
      else:
        tags_string = '"' + '", "'.join(tags) + '"'

      ####################
      ##### Location ##### 
      ####################
      region           = item.select('Resource/spatial/Description/region/text()').extract()
      locality         = item.select('Resource/spatial/Description/locality/text()').extract()
      street_address   = item.select('Resource/spatial/Description/street-address/text()').extract()

      if region or locality or street_address:
        region         = processField(region)
        locality       = processField(locality)
        street_address = processField(street_address)
        locationTemp   = [street_address, locality, region]
        location       = ''

        # Join street address, locality and region with commas, skipping
        # whichever parts are missing ('part', not 'item', so the outer
        # record loop's variable is not shadowed)
        for part in locationTemp:
          if part:
            if location == '':
              location = part
            else:
              location = location + ', ' + part
        if location.endswith(','):
          location = location[:-1]
      else:
        location = ''

      ##########################
      ######## Lat/Long ########
      ##########################
      # Find coordinates using Google Maps API
      lat = '' 
      lng = ''
      if location != '':
        key                = '&key=AIzaSyCGF2BwNPNckrbx6L2tQRATBcjKv0C3xCo'
        google_uri         = 'https://maps.googleapis.com/maps/api/geocode/json?address=' 
        location_encoded   = location.encode('utf8')
        location_url_ready = urllib.quote_plus(location_encoded, safe='')
        request_uri        = google_uri + location_url_ready + key 
        with contextlib.closing(urllib.urlopen(request_uri)) as response:
          data = json.load(response)
          if data['results']:
            lat = json.dumps(data['results'][0]['geometry']['location']['lat'])
            lng = json.dumps(data['results'][0]['geometry']['location']['lng'])
          else:
            lat = 'null' 
            lng = 'null'

      ##########################
      ######## JSONify #########
      ##########################
      json_entry = ( '{"title": "' 
        + abstract + '", "uri": "' 
        + uri + '", "attribution_uri": "' 
        + source + '", "media_date_created": "' 
        + media_date_created + '", "media_creator_username": "******", "thumbnail_url": "' 
        + thumbnail_url + '", "media_geo_latitude": "' 
        + lat + '", "media_geo_longitude": "' 
        + lng + '", "location": "' 
        + location + '", "tags": [' 
        + tags_string + '], "archive": "' 
        + archive + '",  "media_type": "'
        + media_type + '", "layer_type": "'
        + layer_type + '", "child_items_count": 0, "published": 1}, '
      )

      #####################
      # Duplicate Checker #
      #####################
      # Check for duplicates only in the "final-..." file since that is the only 
      # file without a resumption token and thus could possibly contain duplicates.
      if not resumption_token and os.path.exists(dup_path):
        dup_list = open(dup_path, 'r').read()
        if unique_id not in dup_list:
          print 'not in dup'
          result.append(json_entry)
      else:
        result.append(json_entry)


    ###################
    # Save Duplicates #
    ###################
    # Save Item URI List
    # 'w' truncates on open; the original 'w+r' mode string is non-standard
    with open(dup_path, 'w') as f:
      print '****** (OVER)WRITING DEDUP LIST ******'
      for item in id_list:
        print>>f, item

    ###########
    # If Done #
    ###########
    if not resumption_token:
      print '****** DONE ******'
      path = output_path + 'final-' + getDateString() + '.json'
      with open(path, 'wb') as f:
        f.write(''.join(result).encode('UTF-8'))
      removeEmptyFiles(output_path)

    ###############
    # Or Next Job #
    ###############
    else: 
      url          = self.template_url + category + '&resumptionToken='
      nextFileLink = url + resumption_token[0].encode('ascii')
      path         = output_path + resumption_token[0].encode('ascii') + '.json'

      with open(path, 'wb') as f:
        f.write(''.join(result).encode('UTF-8'))
      yield Request(nextFileLink, callback = self.parse)
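
# Hedged sketch: getCAT, processField, saveResumptionToken, getDateString and
# removeEmptyFiles are project helpers not shown in this listing. Judging from
# the call sites above (lists from .extract(), occasionally re-applied to a
# plain string), processField most plausibly behaves like:
def processField(extracted):
    # .extract() returns a list of matches: take the first, or '' if empty;
    # let strings pass through unchanged
    if isinstance(extracted, list):
        return extracted[0] if extracted else ''
    return extracted

Beispiel #30
0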
    def parse_rss(self, response):
       x = XmlXPathSelector(response)
       x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
       x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
       x.register_namespace("media", "http://search.yahoo.com/mrss/")
       
       episodes = x.select('./channel/item')
       items = []
       parent = response.meta['parent'] 
       for episode in episodes:
           item = PodcastItem()
           #item = copy.copy(parent)
           
           
           item['channelId'] = parent['channelId']
           item['channelName'] = parent['channelName']
           item['channelDescription'] = parent['channelDescription']
           item['channelImage'] = parent['channelImage']
           item['channelHomepage'] = parent['channelHomepage']

           item['ownerId'] = parent['ownerId']
           item['ownerName'] = parent['ownerName']
           item['ownerKey'] = parent['ownerKey']
           item['ownerImage'] = parent['ownerImage']
           item['ownerHomepage'] = parent['ownerHomepage']
           
           item['brandFeed'] = response.meta['rss'] 
           item['brandName'] = parent['brandName'] 
           item['brandId'] = parent['brandId']
           #item['brandIds'] = parent['brandIds']
           #item['brandShortName'] = parent['brandShortName']
           
           item['brandImage'] = parent['brandImage']
           #item['brandTimes'] = parent['brandTimes']
           #item['brandCurrentItem'] = parent['brandCurrentItem']
           #item['brandLanguage'] = parent['brandLanguage']
           #item['brandAvgDuration'] = parent['brandAvgDuration']
           #item['brandFrequency'] = parent['brandFrequency']
           # item['brandTags'] = parent['brandTags']
           
           
           item['id'] = episode.select('./guid/text()').extract()
           item['type'] = 'podcast'
           item['episodeId'] = episode.select('./guid/text()').extract()
           item['episodeTitle'] = episode.select('./title/text()').extract()
           item['episodeSubtitle'] = episode.select('./itunes:subtitle/text()').extract()
           item['episodeDescription'] = episode.select('./description/text()').extract()
           #item['episodeStart'] = episode.select('xxx').extract()
           #item['episodeEnd'] = episode.select('xxx').extract()
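           # NB: the RSS <enclosure> "length" attribute is the file size in
           # bytes, not a running time; an actual duration would come from
           # <itunes:duration>, if the feed provides it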
           item['episodeDuration'] = episode.select('./enclosure/@length').extract()
           item['episodePublishDate'] = episode.select('./pubDate/text()').extract()
           item['episodeMimeType'] = episode.select('./media:content/@type').extract()
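           # NB: ./link/text() is the episode's web page; the playable media
           # file would normally be ./enclosure/@url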
           item['episodeMedia'] = episode.select('./link/text()').extract()
           #item['episodeImage'] = episode.select('xxx').extract()
           #item['episodeHomepage'] = episode.select('xxx').extract()
           item['episodeFirstBroadcast'] = episode.select('./pubDate/text()').extract()
           #item['episodeAvailableUntil'] = episode.select('xxx').extract()
           #item['episodeTags'] = episode.select('xxx').extract()
           #item['episodeRelatedTitle'] = episode.select('xxx').extract()
           #item['episodeRelatedUrl'] = episode.select('xxx').extract()
           item['brandGenres'] = episode.select('//./channel/itunes:category/@text').extract()
           item['brandGenreIds'] = episode.select('//./channel/itunes:category/@text').extract()
           item['brandDescription'] = episode.select('//./channel/description/text()').extract()
           item['brandShortName'] = episode.select('//./channel/title/text()').extract()
           item['brandLanguage'] = episode.select('//./channel/language/text()').extract()
           item['brandHomepage'] = episode.select('//./channel/link').extract()
           item['brandImage'] = episode.select('//./channel/itunes:image/@href').extract()
           """
           
           item['seriesId'] = site.select('xxx').extract()
           item['seriesName'] = site.select('xxx').extract()
           item['seriesDescription'] = site.select('xxx').extract()
           item['seriesImage'] = site.select('xxx').extract()
           item['seriesFeed'] = site.select('xxx').extract()
           item['seriesCurrentItem'] = site.select('xxx').extract()
           item['seriesHomepage'] = site.select('xxx').extract()
           item['seriesTags'] = site.select('xxx').extract()
           """
           
           items.append(item)
       return items
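
    # Hedged usage note, inferred from the meta lookups above: parse_rss
    # expects the parent channel item and the feed URL in request meta, so a
    # caller would presumably schedule it along the lines of:
    #   yield Request(rssUrl,
    #                 meta={'parent': parentItem, 'rss': rssUrl},
    #                 callback=self.parse_rss)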