def parse(self, response):
    """Parse either an iTunes Atom listing (one lookup request per entry)
    or a plain RSS feed (one discovery PodcastItem for the channel)."""
    x = XmlXPathSelector(response)
    x.register_namespace("im", "http://itunes.apple.com/rss")
    x.register_namespace('atom', 'http://www.w3.org/2005/Atom')
    feedCount = str(len(self.start_urls))
    self.i = self.i + 1
    self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)
    entries = x.select('//atom:entry')
    if entries:
        # an itunes rss feed: chase the store lookup for each entry
        for entry in entries:
            # renamed from `id` to avoid shadowing the builtin
            entry_id = entry.select('./atom:id/@im:id').extract()
            self.log('Entry %s' % (str(entry_id)), level=log.INFO)
            if not entry_id:
                # guard: entries without an im:id previously raised IndexError
                continue
            yield Request('http://itunes.apple.com/lookup?id=' + entry_id[0],
                          callback=self.getItunesTrackJson)
    else:
        # a single feed: load channel-level discovery metadata directly
        l = XPathItemLoader(PodcastItem(), x)
        l.add_value('id', 'rssdisco_' + response.url)
        l.add_value('audioType', 'disco')
        l.add_value('brandFeed', response.url)
        l.add_xpath('brandName', '//./channel/title/text()')
        self.log('Feed from rss %s' % (response.url), level=log.INFO)
        item = l.load_item()
        yield item
def parseSubGenre(self, response):
    """Parse a Kerbango station listing for one genre.

    Copies per-station fields into the inherited metaData dict and chains a
    TuneIn suggest search per station, falling back to parsing the supplied
    playlist URL directly if that request errors.
    """
    x = XmlXPathSelector(response)
    x.register_namespace("kb", "http://www.kerbango.com/xml")
    # metaData carries genreName/genreId from the previous callback
    metaData = response.meta['metaData']
    stations = x.select('//kb:results/kb:station_record')  # was limited to less than 5 stations previously
    for station in stations:
        # NOTE(review): extract()[0] raises IndexError if a station_record has
        # no url node -- assumed to always be present; confirm against feed.
        metaData['channelPlaylist'] = [station.select('./kb:station_url_record/kb:url/text()').extract()[0].rstrip('/ \r\n')]
        metaData['channelName'] = station.select('./kb:station/text()').extract()
        metaData['channelDescription'] = station.select('./kb:description/text()').extract()
        metaData['streamId'] = station.select('./kb:esid/text()').extract()
        metaData['streamBandwidth'] = station.select('./kb:station_url_record/kb:bandwidth_kbps/text()').extract()
        metaData['streamData'] = station.select('./kb:station_url_record/kb:status_code/text()').extract()
        # the genre fields double as the channel's classification
        metaData['channelGenreIds'] = metaData['genreId']
        metaData['channelGenres'] = metaData['genreName']
        metaData['channelCategory'] = metaData['genreName']
        self.log('parseSubGenre %s %s' % (metaData['genreName'], metaData['channelName'] ), level=log.INFO)
        channelName = metaData['channelName'][0]
        # cope with BBC names that embed the bitrate ("... Low"/"... High") in the name
        channelName = re.sub(r'Low$|High$', '', channelName).strip()
        tuneInSearchUrl = 'http://tunein.com/search/suggest/?query='+ channelName
        # assume all is well and the supplied url is indeed a playlist!
        request = Request(tuneInSearchUrl, meta = {'metaData': copy.deepcopy(metaData)}, callback=self.parseTuneInSearch, errback=lambda x:self.parsePlaylist(x,copy.deepcopy(metaData)) )
        yield request
def parse(self, response):
    """Yield one ZoinkscraperItem per //item node of the RSS response."""
    selector = XmlXPathSelector(response)
    selector.register_namespace("f", "http://www.w3.org/2005/Atom")
    for node in selector.select('//item'):
        title = node.select('./title/text()')[0].extract_unquoted()
        link = node.select('./link/text()')[0].extract()
        # strip the last 6 chars of pubDate (presumably " +0000" tz suffix)
        raw_date = node.select('./pubDate/text()')[0].extract()[:-6]
        entry = ZoinkscraperItem()
        entry['name'] = title
        entry['url'] = link
        entry['date'] = datetime.strptime(raw_date, '%a, %d %b %Y %H:%M:%S')
        yield entry
def parse(self, response):
    """Parse an OAI-PMH ListRecords page from the Yahoo! Japan archive.

    Writes one text file of JSON fragments per page (named after the
    resumptionToken, or 'last.txt' on the final page) and follows the
    resumptionToken to the next page when one is present.
    """
    x = XmlXPathSelector(response)
    x.remove_namespaces()
    x.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    records = x.select('//record/metadata/RDF')
    jsons = []
    for record in records:
        title = record.select('Resource/title/text()').extract()
        uri = record.select('Resource/screen/Image/@rdf:about').extract()
        tags = record.select('Resource/subject/Description/value/text()').extract()
        thumbnail = record.select('Resource/thumbnail/Image/@rdf:about').extract()
        lat = record.select('Resource/spatial/Description/lat/text()').extract()
        # renamed from `long` to avoid shadowing the builtin
        lng = record.select('Resource/spatial/Description/long/text()').extract()
        locality = record.select('Resource/spatial/Description/locality/text()').extract()
        tags_string = '"' + '", "'.join(tags) + '"'
        newlat = lat[0] if lat else 'null'
        newlong = lng[0] if lng else 'null'
        newloc = locality[0] if locality else ''
        # NOTE(review): fields are concatenated without JSON escaping, so an
        # embedded quote corrupts the output; kept as-is to preserve the
        # existing file format -- consider json.dumps if consumers allow it.
        json_entry = ('{"title": "' + title[0] + '", "uri": "' + uri[0] +
                      '", "attribution_uri": "' + uri[0] +
                      '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail[0] +
                      '", "media_geo_latitude": ' + newlat +
                      ', "media_geo_longitude": ' + newlong +
                      ', "location": "' + newloc +
                      '", "tags": [' + tags_string +
                      '], "archive":"Yahoo! Japan", "media_type": "Image", "layer_type": "Image", '
                      '"child_items_count":0, "published":1}, ')
        jsons.append(json_entry)
    resumptionToken = x.select('//resumptionToken/text()').extract()
    if not resumptionToken:
        # last page: dump remaining records and stop.
        # (bug fix: previously still yielded Request('') here, which is invalid)
        with open('last.txt', 'wb') as f:
            f.write(''.join(jsons).encode("UTF-8"))
    else:
        token = resumptionToken[0].encode('ascii')
        with open(token + '.txt', 'wb') as f:
            f.write(''.join(jsons).encode("UTF-8"))
        nextFileLink = ("http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh"
                        "?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + token)
        yield Request(nextFileLink, callback=self.parse)
def xmliter_lxml(obj, nodename, namespace=None):
    """Stream-parse an XML document and yield an XmlXPathSelector for each
    <nodename> element, optionally qualified by `namespace` (prefix 'x')."""
    from lxml import etree
    stream = _StreamReader(obj)
    qualified_tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    node_xpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, element in etree.iterparse(stream, tag=qualified_tag,
                                      encoding=stream.encoding):
        serialized = etree.tostring(element)
        element.clear()  # release the parsed subtree as we stream
        selector = XmlXPathSelector(text=serialized)
        if namespace:
            selector.register_namespace('x', namespace)
        yield selector.select(node_xpath)[0]
def xmliter_lxml(obj, nodename, namespace=None):
    """Iterate the <nodename> elements of an XML stream, yielding one
    XmlXPathSelector per element; namespace-aware when `namespace` is set."""
    from lxml import etree
    source = _StreamReader(obj)
    if namespace:
        match_tag = '{%s}%s' % (namespace, nodename)
        selector_path = '//x:%s' % nodename
    else:
        match_tag = nodename
        selector_path = '//' + nodename
    events = etree.iterparse(source, tag=match_tag, encoding=source.encoding)
    for _, elem in events:
        raw = etree.tostring(elem)
        elem.clear()  # keep memory bounded while streaming
        sel = XmlXPathSelector(text=raw)
        if namespace:
            sel.register_namespace('x', namespace)
        yield sel.select(selector_path)[0]
def test_selector_namespaces_simple(self):
    """A registered prefix selects only the element in that namespace."""
    body = """
    <test xmlns:somens="http://scrapy.org">
       <somens:a id="foo"/>
       <a id="bar">found</a>
    </test>
    """
    response = XmlResponse(url="http://example.com", body=body)
    selector = XmlXPathSelector(response)
    selector.register_namespace("somens", "http://scrapy.org")
    # only the somens:a element matches; the un-namespaced <a> does not
    matched = selector.select("//somens:a").extract()
    self.assertEqual(matched, ['<somens:a id="foo"/>'])
def parse(self, response):
    """Collect one DealItem per //item of a steepandcheap RSS feed."""
    selector = XmlXPathSelector(response)
    selector.register_namespace('sac', 'http://www.steepandcheap.com/docs/steepcheap/rss.xml')
    # (item field, relative xpath) pairs applied to every deal node
    field_paths = [
        ('title', 'title/text()'),
        ('link', 'link/@href'),
        ('desc', 'description/text()'),
        ('shortDesc', 'sac:listDescription/text()'),
        ('curPrice', 'sac:priceCurrent/text()'),
        ('regPrice', 'sac:priceRegular/text()'),
    ]
    items = []
    for deal in selector.select('//item'):
        item = DealItem()
        for field, xpath in field_paths:
            item[field] = deal.select(xpath).extract()
        items.append(item)
    return items
def parse(self, response):
    """Yield a Product per //channel/item of a Google Base product feed."""
    base_url = get_base_url(response)
    sel = XmlXPathSelector(response)
    sel.register_namespace("g", "http://base.google.com/ns/1.0")
    for node in sel.select('//channel/item'):
        loader = ProductLoader(item=Product(), selector=node)
        for field, xpath in (
                ('url', 'link/text()'),
                ('name', 'title/text()'),
                ('image_url', 'g:image_link/text()'),
                ('price', 'g:price/text()'),
                ('brand', 'g:brand/text()'),
                # NOTE(review): category is taken from g:brand, not
                # g:product_type -- looks deliberate here but worth confirming.
                ('category', 'g:brand/text()'),
                ('sku', 'g:id/text()'),
                ('identifier', 'g:id/text()')):
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    """Request the Kerbango sub-genre listing for each menu record
    (currently restricted to menu_id 21)."""
    sel = XmlXPathSelector(response)
    sel.register_namespace("kb", "http://www.kerbango.com/xml")
    # restricted to menu_id 21 for now
    for genre in sel.select('//kb:results/kb:menu_record[kb:menu_id/text()="21"]'):
        metaData = {
            'genreName': genre.select('./kb:name/text()').extract()[:1],
            'genreId': genre.select('./kb:menu_id/text()').extract()[:1],
        }
        request = Request(
            'http://pri.kts-af.net/xml/index.xml?tuning_id=' + metaData['genreId'][0],
            meta={'metaData': copy.deepcopy(metaData)},
            callback=self.parseSubGenre)
        self.logProgress('parse', metaData['genreName'], metaData['genreId'], '',
                         level=log.INFO)
        yield request
def parse(self, response):
    """Parse an iTunes U Atom listing.

    Builds a metaData dict per //itunesu:entry and chains a GET to the
    iTunes store podcast page for that track (spoofing the iTunes UA).
    """
    xxs = XmlXPathSelector(response)
    # self.namespaces maps prefix -> schema URI (py2 dict iteration)
    for namespace, schema in self.namespaces.iteritems():
        xxs.register_namespace(namespace, schema)
    for entry in xxs.select('//itunesu:entry'):
        metaData={}
        metaData['audioType'] = entry.select('./im:contentType/@term').extract()
        metaData['brandId'] = entry.select('./itunesu:id/text()').extract()
        metaData['brandName'] = entry.select('./im:name/text()').extract()
        metaData['brandDescription'] = entry.select('./itunesu:summary/text()').extract()
        metaData['brandCategory'] = entry.select('./itunesu:category/@label').extract()
        metaData['brandGenres'] = entry.select('./itunesu:category/@label').extract()
        metaData['brandGenreIds'] = entry.select('./itunesu:category/@im:id').extract()
        metaData['brandPublishDate'] = entry.select('./im:releaseDate/text()').extract()
        metaData['itunesTrackId'] = entry.select('./itunesu:id/@im:id').extract()
        # artwork variants keyed by pixel height
        metaData['itunesArtworkUrl55'] = entry.select('./im:image[@height="55"]/text()').extract()
        metaData['itunesArtworkUrl60'] = entry.select('./im:image[@height="60"]/text()').extract()
        metaData['itunesArtworkUrl170'] = entry.select('./im:image[@height="170"]/text()').extract()
        metaData['itunesCollectionPrice'] = entry.select('./im:price/@amount').extract()
        metaData['itunesCollectionViewUrl'] = entry.select('./itunesu:link/@href').extract()
        # haven't got anything that identifies the "department" so using category for now
        metaData['channelName'] = metaData['brandCategory']
        metaData['ownerName'] = entry.select('./im:artist/text()').extract()
        metaData['ownerId'] = entry.select('./im:artist/@href').extract()
        #html = entry.select('./itunesu:content[@type="html"]/text()').extract()[0]
        #hxs = HtmlXPathSelector(text=html)
        #metaData['ownerName'] = hxs.select('//./a[contains(@href,"institution")]/text()').extract()
        # NOTE(review): assumes itunesTrackId extracted non-empty; [0] raises otherwise
        itunesUrl = 'http://itunes.apple.com/WebObjects/DZR.woa/wa/viewPodcast?cc=us&mt=10&id=' + metaData['itunesTrackId'][0]
        # presumably the iTunes UA is needed for the store page; retries disabled
        request = Request(itunesUrl, method='GET',
                          meta = {'metaData': copy.deepcopy(metaData), 'dont_retry': True},
                          headers={ "User-Agent": "iTunes/9.1.1" },
                          callback=self.parseItunesHtml)
                          #errback=lambda x:self.parseItem(x,copy.deepcopy(metaData)) )
        yield request
def parse(self, response):
    """Parse a NOAA CO-OPS SOAP currents response.

    Inserts each observation row into the DB (subject to self.needStore),
    records the harvested time window for the station, then advances to the
    next station's start URL.
    """
    xxs = XmlXPathSelector(response)
    xxs.register_namespace('soapenv', 'http://schemas.xmlsoap.org/soap/envelope/')
    xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
    xxs.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    xxs.register_namespace(
        'CurrentsAndMetadata',
        'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl'
    )
    # parallel lists: timestamps, current speeds (CS) and directions (CD)
    timelist = xxs.select(
        '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()'
    ).extract()
    cspdlist = xxs.select(
        '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()'
    ).extract()
    cdirlist = xxs.select(
        '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()'
    ).extract()
    print len(timelist)  # debug output (python 2 print statement)
    for i in range(0, len(cdirlist)):
        # [0:-2] strips the timestamp's last two chars -- presumably a
        # trailing fraction/suffix; confirm against the service format
        sql_str = self.SQL_INSERT_STUB.format(
            self.get_current_station().lower(), str(timelist[i])[0:-2],
            str(cspdlist[i]), str(cdirlist[i]), 'datafactory_currentdata')
        #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
        d_time_unware = datetime.datetime.strptime(
            str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
        # localize the naive timestamp as UTC before the store-filter check
        d_time1 = pytz.utc.localize(d_time_unware)
        d_time = d_time1.astimezone(pytz.utc)
        if self.needStore(d_time):
            self.db.query(sql_str)
            self.db.commit()
    if timelist:
        # record the harvested [start, end] window for this station
        sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
            DB_SETTINGS['DATABASE_TIME_TABLE'], self.get_current_station(),
            self.startDate.astimezone(
                pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
            self.endDate.astimezone(
                pytz.utc).strftime("%Y-%m-%d %H:%M:%S"))
        self.db.query(sql_str)
        self.db.commit()
    # advance to the next station, if any remain
    self.station_slot = self.station_slot + 1
    if (self.station_slot < len(self.start_urls)):
        yield self.start_urls[self.station_slot]
def parse(self, response):
    """Parse a Google Base Atom product feed into Product items.

    Skips 'B-STOCK' entries and prefers g:sale_price over g:price.
    """
    xxs = XmlXPathSelector(response)
    base_url = get_base_url(response)
    xxs.register_namespace("f", "http://www.w3.org/2005/Atom")
    products = xxs.select('//f:entry')
    for product in products:
        # per-entry prefixes: g = Google Base fields, p = Atom
        product.register_namespace("g", "http://base.google.com/ns/1.0")
        product.register_namespace("p", "http://www.w3.org/2005/Atom")
        product_loader = ProductLoader(item=Product(), selector=product)
        name = product.select('./p:title/text()').extract()[0]
        if 'B-STOCK' in name.upper():
            continue  # skip B-stock listings entirely
        product_loader.add_value('name', name)
        url = product.select('./p:link/@href').extract()[0]
        product_loader.add_value('url', urljoin_rfc(base_url, url))
        image_url = product.select('./g:image_link/text()').extract()
        if image_url:
            product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        category = product.select('./g:product_type/text()').extract()
        if category:
            product_loader.add_value('category', category[0])
        brand = product.select('./g:brand/text()').extract()
        if brand:
            product_loader.add_value('brand', brand[0])
        # prefer the sale price; fall back to the regular price
        price = product.select('./g:sale_price/text()').extract()
        if price:
            product_loader.add_value('price', extract_price(price[0]))
        else:
            # NOTE(review): assumes g:price is always present when
            # g:sale_price is not; [0] raises IndexError otherwise
            price = product.select('./g:price/text()').extract()
            product_loader.add_value('price', extract_price(price[0]))
        # sku = product.select('./g:gtin/text()').extract()
        # if sku:
        #     product_loader.add_value('sku', sku[0])
        # the feed id is used for both identifier and sku
        identifier = product.select('./g:id/text()').extract()[0]
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        shipping_cost = product.select(
            './g:shipping/g:price/text()').extract()
        if shipping_cost:
            product_loader.add_value('shipping_cost',
                                     extract_price(shipping_cost[0]))
        product = product_loader.load_item()
        yield product
def parse(self, response):
    """Parse a BBC programmes XML index.

    Builds a 'parent' dict of brand/channel/owner metadata per ./program and
    returns a feed Request per programme with that dict in meta.
    """
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    #programs = x.select('./program[position()<3]')
    programs = x.select('./program')
    allitems=[]
    for program in programs:
        parent={}
        parent['brandId'] = program.select('./systemRef[@systemId="pid.brand"][position()=1]/@key').extract()
        parent['brandIds'] = program.select('./systemRef[@systemId="pid.brand"]/@key').extract()
        # NOTE(review): [0] assumes every program has a feed link; confirm
        parent['brandFeed'] = program.select('./link[@target="feed"]/@url').extract()[0]
        parent['brandName'] = program.select('./title/text()').extract()
        parent['brandShortName'] = program.select('./shortTitle/text()').extract()
        parent['brandDescription'] = program.select('./description/text()').extract()
        parent['brandHomepage'] = program.select('./link[@target="homepage"]/@url').extract()
        parent['brandImage'] = program.select('./image/@url').extract()
        parent['brandTimes'] = program.select('./@frequency').extract()
        parent['brandCurrentItem'] = program.select('./link[@target="currentItem"]/@url').extract()
        parent['brandLanguage'] = program.select('./@language').extract()
        parent['brandAvgDuration'] = program.select('./@typicalDuration').extract()
        parent['brandFrequency'] = program.select('./@frequency').extract()
        # parent['brandTags'] = program.select('xxx').extract()
        parent['brandGenres'] = program.select('./bbcGenre/@name').extract()
        parent['brandGenreIds'] = program.select('./systemRef[@systemId="pid.genre"]/@key').extract()
        parent['channelId'] = program.select('./network/@id').extract()
        parent['channelName'] = program.select('./network/@name').extract()
        # parent['channelDescription'] = program.select('//head/meta[@name="description"]/@content').extract()
        # parent['channelImage'] = 'http://sverigesradio.se/diverse/appdata/isidor/images/news_images/3297/459929_87_56.jpg'
        # parent['channelFeed'] = program.select('xxx').extract()
        parent['channelHomepage'] = 'http://www.bbc.co.uk/' + parent['channelId'][0]
        # owner is hard-coded: all programmes here belong to BBC Radio
        parent['ownerId'] = 'BBC'
        parent['ownerName'] = 'BBC Radio'
        parent['ownerKey'] = 'bbc'
        parent['ownerImage'] = 'http://static.bbci.co.uk/frameworks/barlesque/2.5.10/desktop/3.5/img/blq-blocks_grey_alpha.png'
        parent['ownerHomepage'] = 'http://www.bbc.co.uk/' + parent['ownerKey']
        request = Request(parent['brandFeed'], meta={'parent': parent}, callback=self.load_rss)
        allitems.append(request)
    return allitems
def parse(self, response):
    """Main parser.

    Each blog post has a main URL (.../YYYY/MM/page_name/index.html) holding
    the post plus the top-voted comments, and a companion javascript page
    (.../YYYY/MM/page_name.js) listing URLs of pages with comment subsets.
    Both are requested here and handled by their specific callbacks.
    """
    sel = XmlXPathSelector(response)
    sel.register_namespace('feedburner', "http://rssnamespace.org/feedburner/ext/1.0")
    for post_url in sel.select('//feedburner:origLink/text()').extract():
        yield Request(post_url, callback=self.parse_page)
        yield Request(post_url.replace('/index.html', '.js'),
                      callback=self.parse_javascript)
def parse(self, response):
    """Yield Products from a Google Base feed.

    g:product_type is split on ' > ' into a category path; items whose
    g:availability is not 'in stock' get stock 0.
    """
    xxs = XmlXPathSelector(response)
    xxs.register_namespace("g", "http://base.google.com/ns/1.0")
    products = xxs.select('//channel/item')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('url', 'link/text()')
        loader.add_xpath('name', 'title/text()')
        loader.add_xpath('image_url', 'g:image_link/text()')
        loader.add_xpath('price', 'g:price/text()')
        loader.add_xpath('brand', 'g:brand/text()')
        # guard: items without g:product_type previously raised IndexError
        product_type = product.select('g:product_type/text()').extract()
        if product_type:
            loader.add_value('category', product_type[0].split(' > '))
        loader.add_xpath('sku', 'g:id/text()')
        loader.add_xpath('identifier', 'g:id/text()')
        # guard: missing g:availability previously raised IndexError;
        # leave stock unset in that case rather than guessing
        availability = product.select('g:availability/text()').extract()
        if availability and availability[0].lower() != 'in stock':
            loader.add_value('stock', 0)
        yield loader.load_item()
def parseFeed(self, response):
    """Build a Solr JSON update from a podcast RSS feed.

    Selects items with an audio/video enclosure, loads one episode doc per
    item (merged with the brand doc from the earlier Solr query), and POSTs
    the whole batch to the Solr update endpoint.
    """
    jsonResponse = response.meta['jsonResponse']
    # Solr stats block holding publish-date aggregates for the brand
    # (currently unused beyond the commented maxDate experiment below)
    brandStats = jsonResponse['stats']['stats_fields']['episodePublishDate']
    #maxDate = brandStats['max']
    #updateDoc = '<delete><query>brandFeed:"'+brandFeed+'"</query></delete>'
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    #########
    # only items carrying an audio or video enclosure
    newEpisodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]]')
    metaData = {}
    metaData['rssUrl'] = response.url
    episodes = []
    #create a single solr update doc that contains all the new episodes and deletes expired ones
    for xmlEpisode in newEpisodes:
        jsonBrand = jsonResponse['grouped']['brandFeed']['groups'][0]['doclist']['docs'][0]
        # NOTE(review): __dict__.values()[0] (py2) pulls the loaded item's
        # single internal field dict -- relies on the item class storing its
        # values in exactly one attribute; confirm before refactoring
        episode = self.load_item(jsonBrand, xmlEpisode, metaData).__dict__.values()[0]
        episodes.append(episode)
    updatejson = JSONEncoder().encode(episodes)
    yield Request( url=self.solrUpdateUrl, method='POST', body=updatejson,
                   headers={'Content-Type':'application/json'},
                   callback=self.dummyEnd )
def load_podcast_rss(self, response):
    """Load episode items from a podcast RSS feed.

    Yields one 'ondemand' item per audio/video episode (capped at 50), or a
    single 'noepisodes' channel item when the feed has no usable episodes.
    """
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    metaData = response.meta['metaData']
    itunesTrackId = metaData['itunesTrackId']
    metaData['rssUrl'] = response.url
    ##########
    # a limit of 50 episodes has been hard coded here, this should be in settings somewhere
    #########
    episodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]][position()<50]')
    podcastEpisodeCount = str(len(episodes))
    items = []
    # running total across all feeds processed by this spider
    self.totalPodcastEpisodes = self.totalPodcastEpisodes + len(episodes)
    if len(episodes)==0:
        # empty feed: still emit a channel-level item so the brand is recorded
        self.logProgress('Empty feed', metaData['brandName'][0], '', itunesTrackId,
                         log.WARNING, ('No episodes for %s' % (response.url)))
        metaData['itemtype']=['noepisodes']
        item = self.load_item(x.select('//channel'), metaData)
        yield item
    else:
        podcastEpisodeIndex = str(len(items))
        podcastEpisodeCount = str(len(episodes))
        self.logProgress('load_podcast_rss', metaData['brandName'][0], '', itunesTrackId,
                         log.INFO, ('%s/%s' % (podcastEpisodeIndex, podcastEpisodeCount)))
        for episode in episodes:
            metaData['itemtype']=['ondemand']
            item = self.load_item(episode, metaData)
            yield item
def parsePlaylistXML(self, response):
    """Parse an iTunes U playlist plist-style XML document.

    For each mp3 track dict, fills episode fields in metaData by looking up
    each <key> and taking the first following sibling value node, then
    yields the loaded item via self.parseItem.
    """
    metaData = response.meta['metaData']
    xxs = XmlXPathSelector(response)
    xxs.register_namespace('itunesu2', 'http://www.apple.com/itms/')
    # only track dicts whose nested dict declares an "mp3" string
    for episode in xxs.select('//itunesu2:Protocol/itunesu2:plist/itunesu2:dict/itunesu2:array/itunesu2:dict[itunesu2:dict/itunesu2:string[.="mp3"]]'):
        # plist pattern: <key>name</key> followed by its value node, hence
        # the following-sibling::...[1] lookups below
        metaData['episodeDuration'] = episode.select('./itunesu2:dict/itunesu2:key[.="duration"]/following-sibling::itunesu2:integer[1]/text()').extract()
        metaData['episodeTitle'] = episode.select('./itunesu2:dict/itunesu2:key[.="songName"]/following-sibling::itunesu2:string[1]/text()').extract()
        metaData['rssItemMediaType'] = episode.select('./itunesu2:dict/itunesu2:key[.="fileExtension"]/following-sibling::itunesu2:string[1]/text()').extract()
        metaData['episodeDescription'] = episode.select('./itunesu2:dict/itunesu2:key[.="description"]/following-sibling::itunesu2:string[1]/text()').extract()
        metaData['itunesArtistName'] = episode.select('./itunesu2:dict/itunesu2:key[.="artistName"]/following-sibling::itunesu2:string[1]/text()').extract()
        metaData['episodePublishDate'] = episode.select('./itunesu2:dict/itunesu2:key[.="releaseDate"]/following-sibling::itunesu2:string[1]/text()').extract()
        metaData['brandFeed'] = episode.select('./itunesu2:dict/itunesu2:key[.="feedURL"]/following-sibling::itunesu2:string[1]/text()').extract()
        metaData['episodeId'] = episode.select('./itunesu2:dict/itunesu2:key[.="episodeGUID"]/following-sibling::itunesu2:string[1]/text()').extract()
        metaData['episodeMedia'] = episode.select('./itunesu2:key[.="URL"]/following-sibling::itunesu2:string[1]/text()').extract()
        # fall back to per-episode artwork only when the brand didn't supply one
        if 'itunesArtworkUrl170' not in metaData:
            metaData['itunesArtworkUrl170'] = episode.select('./itunesu2:key[.="artworkURL"]/following-sibling::itunesu2:string[1]/text()').extract()
        if 'episodeFirstBroadcast' not in metaData:
            metaData['episodeFirstBroadcast'] = metaData['episodePublishDate']
        yield self.parseItem(metaData)
def load_rss(self, response):
    """Read the feed's channel title and chain an iTunes titleTerm search,
    carrying the parent metadata and this selector forward in meta."""
    selector = XmlXPathSelector(response)
    for prefix, uri in (("xsi", "http://www.w3.org/2001/XMLSchema-instance"),
                        ("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd"),
                        ("media", "http://search.yahoo.com/mrss/")):
        selector.register_namespace(prefix, uri)
    title = selector.select('//./channel/title/text()').extract()[0]
    search_url = ('http://itunes.apple.com/search?term=' + title +
                  '&entity=podcast&attribute=titleTerm')
    return Request(search_url,
                   meta={'parent': response.meta['parent'],
                         'rss': selector,
                         'rssUrl': response.url},
                   callback=self.get_itunes_info)
def parse(self, response):
    """Parse a NOAA CO-OPS SOAP currents response (duplicate of the
    formatted variant above in this file).

    Stores each observation (subject to self.needStore), records the
    harvested time window, then moves on to the next station URL.
    """
    xxs = XmlXPathSelector(response)
    xxs.register_namespace('soapenv', 'http://schemas.xmlsoap.org/soap/envelope/')
    xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
    xxs.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    xxs.register_namespace('CurrentsAndMetadata', 'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl')
    # parallel lists: timestamps, current speeds (CS) and directions (CD)
    timelist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()').extract()
    cspdlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()').extract()
    cdirlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()').extract()
    print len(timelist)  # debug output (python 2 print statement)
    for i in range(0, len(cdirlist)):
        # [0:-2] strips the timestamp's last two chars -- presumably a
        # trailing suffix; confirm against the service format
        sql_str = self.SQL_INSERT_STUB.format(self.get_current_station().lower(),
                                              str(timelist[i])[0:-2],
                                              str(cspdlist[i]),
                                              str(cdirlist[i]),
                                              'datafactory_currentdata')
        #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
        d_time_unware = datetime.datetime.strptime(str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
        # localize the naive timestamp as UTC before the store-filter check
        d_time1 = pytz.utc.localize(d_time_unware)
        d_time = d_time1.astimezone(pytz.utc)
        if self.needStore(d_time):
            self.db.query(sql_str)
            self.db.commit()
    if timelist:
        # record the harvested [start, end] window for this station
        sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
            DB_SETTINGS['DATABASE_TIME_TABLE'],
            self.get_current_station(),
            self.startDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
            self.endDate.astimezone(pytz.utc).strftime ("%Y-%m-%d %H:%M:%S")
        )
        self.db.query(sql_str)
        self.db.commit()
    # advance to the next station, if any remain
    self.station_slot = self.station_slot + 1
    if (self.station_slot < len(self.start_urls)):
        yield self.start_urls[self.station_slot]
def load_rss(self, response):
    """Read the channel image title and chain an iTunes (FR store)
    titleTerm podcast search, passing the parent metadata along."""
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    title = x.select('//./channel/image/title/text()').extract()[0]
    parent = response.meta['parent']
    # alternative URL?
    # http://ax.itunes.apple.com/WebObjects/MZStoreServices.woa/wa/wsSearch?term=%22Les%20Aventuriers%22&entity=podcast&attribute=titleTerm&country=FR
    itunesUrl = 'http://itunes.apple.com/search?term='+ title +'&entity=podcast&attribute=titleTerm&country=FR'
    # fix: stray debug `print` replaced with the spider log so the URL goes
    # through scrapy's logging instead of raw stdout
    self.log('iTunes search url %s' % itunesUrl, level=log.INFO)
    return Request(itunesUrl, dont_filter=True,
                   meta={'parent': parent, 'rss': x, 'rssUrl': response.url},
                   callback=self.get_itunes_info)
def parseAudioBookRSS(self, response):
    """Build one 'audiobook' item per chapter (//channel/item) of the feed,
    snapshotting the shared metaData per chapter via deepcopy."""
    sel = XmlXPathSelector(response)
    for prefix, uri in (("xsi", "http://www.w3.org/2001/XMLSchema-instance"),
                        ("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd"),
                        ("media", "http://search.yahoo.com/mrss/")):
        sel.register_namespace(prefix, uri)
    metaData = response.meta['metaData']
    items = []
    for chapter in sel.select('//channel/item'):
        metaData['audioType'] = ['audiobook']
        items.append(self.load_item(chapter, copy.deepcopy(metaData)))
    return items
def test_selector_namespaces_multiple(self):
    """Selections work across a default namespace plus two prefixed ones,
    including attributes and chained relative selects."""
    body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
            xmlns:b="http://somens.com"
            xmlns:p="http://www.scrapy.org/product" >
    <b:Operation>hello</b:Operation>
    <TestTag b:att="value"><Other>value</Other></TestTag>
    <p:SecondTestTag><material/><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
</BrowseNode>
"""
    response = XmlResponse(url="http://example.com", body=body)
    x = XmlXPathSelector(response)
    for prefix, uri in (
            ("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05"),
            ("p", "http://www.scrapy.org/product"),
            ("b", "http://somens.com")):
        x.register_namespace(prefix, uri)
    # default-namespace element via the registered "xmlns" prefix
    self.assertEqual(len(x.select("//xmlns:TestTag")), 1)
    # element in the b: namespace
    self.assertEqual(x.select("//b:Operation/text()").extract()[0], 'hello')
    # namespaced attribute on a default-namespace element
    self.assertEqual(x.select("//xmlns:TestTag/@b:att").extract()[0], 'value')
    # mixed-prefix descent and chained relative select
    self.assertEqual(x.select("//p:SecondTestTag/xmlns:price/text()").extract()[0], '90')
    self.assertEqual(x.select("//p:SecondTestTag").select("./xmlns:price/text()")[0].extract(), '90')
    self.assertEqual(x.select("//p:SecondTestTag/xmlns:material").extract()[0], '<material/>')
def get_itunes(self, response):
    """Merge iTunes search JSON with a previously fetched RSS feed.

    Builds one PodcastItem per ./channel/item of the stored RSS selector,
    mixing channel-level fields (looked up with //./channel/... paths) and
    per-episode fields; owner fields are hard-coded to NPR.
    """
    itunes = json.loads(response.body)
    # response.meta['rss'] holds the raw feed captured by the earlier callback
    rss = XmlXPathSelector(response.meta['rss'])
    rss.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    rss.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    rss.register_namespace("media", "http://search.yahoo.com/mrss/")
    episodes = rss.select('./channel/item')
    items = []
    for episode in episodes:
        item = PodcastItem()
        # channel-level fields (same for every episode of this feed)
        item['brandGenres'] = episode.select('//./channel/itunes:category/@text').extract()
        item['brandGenreIds'] = episode.select('//./channel/./itunes:category/@text').extract()
        item['brandDescription'] = episode.select('//./channel/description/text()').extract()
        item['brandShortName'] = episode.select('//./channel/title/text()').extract()
        item['brandLanguage'] = episode.select('//./channel/language/text()').extract()
        item['brandHomepage'] = episode.select('//./channel/link/text()').extract()
        item['brandImage'] = episode.select('//./channel/itunes:image/@href').extract()
        item['channelId'] = episode.select('//./channel/itunes:author/text()').extract()
        item['channelName'] = episode.select('//./channel/itunes:author/text()').extract()
        #item['channelDescription'] = parent['channelDescription']
        item['channelImage'] = episode.select('//./channel/itunes:image/@href').extract()
        item['channelHomepage'] = episode.select('//./channel/link/text()').extract()
        # owner is hard-coded: this spider only handles NPR feeds
        item['ownerId'] = 'NPR'
        item['ownerName'] = 'NPR'
        item['ownerKey'] = 'npr'
        item['ownerImage'] = 'http://media.npr.org/chrome/news/nprlogo_138x46.gif'
        item['ownerHomepage'] = 'http://www.npr.org'
        item['brandFeed'] = response.meta['rss']
        item['brandName'] = episode.select('//./channel/title/text()').extract()
        item['brandId'] = response.meta['rss']
        item['type'] = 'podcast'
        # per-episode fields
        item['id'] = episode.select('./guid/text()').extract()
        item['episodeId'] = episode.select('./guid/text()').extract()
        item['episodeTitle'] = episode.select('./title/text()').extract()
        item['episodeSubtitle'] = episode.select('./itunes:subtitle/text()').extract()
        item['episodeDescription'] = episode.select('./description/text()').extract()
        #item['episodeStart'] = episode.select('xxx').extract()
        #item['episodeEnd'] = episode.select('xxx').extract()
        # NOTE(review): this is the enclosure byte length, not a duration
        item['episodeDuration'] = episode.select('./enclosure/@length').extract()
        item['episodePublishDate'] = episode.select('./pubDate/text()').extract()
        item['episodeMimeType'] = episode.select('./media:content/@type').extract()
        item['episodeMedia'] = episode.select('./guid/text()').extract()
        #item['episodeImage'] = episode.select('xxx').extract()
        #item['episodeHomepage'] = episode.select('xxx').extract()
        item['episodeFirstBroadcast'] = episode.select('./pubDate/text()').extract()
        #item['episodeAvailableUntil'] = episode.select('xxx').extract()
        #item['episodeTags'] = episode.select('xxx').extract()
        #item['episodeRelatedTitle'] = episode.select('xxx').extract()
        #item['episodeRelatedUrl'] = episode.select('xxx').extract()
        #item['itunesGenres'] = itunes['results'][0]['genres'][0]
        #item['itunesGenreIds'] = itunes['results'][0]['genreIds'][0]
        #item['itunesartworkUrl30'] = itunes['results'][0]['artworkUrl30'][0]
        items.append(item)
    return items
def load_podcast_rss(self, response):
    """Parse a podcast RSS feed into PodcastItem objects, one per episode.

    Combines three sources: the RSS <channel>/<item> XML, the iTunes lookup
    JSON for the podcast (``metaData['itunes1']``) and, when present, a
    second lookup for the artist (``metaData['itunes2']``); the metaData
    dict also carries crawl-progress context used only for logging.

    Fixes applied in review:
    * the brandDescription fallback was a bare ``["Unknown"]`` expression
      (no assignment), so missing descriptions were left unset;
    * five iTunes guards tested the prefixed output key (e.g.
      ``"itunesCurrency" in itunes1``) while reading the unprefixed JSON key,
      so those values were always silently dropped.

    Returns a list of PodcastItem objects.
    """
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    metaData = response.meta["metaData"]
    # Crawl-progress context (logging only).
    genreName = metaData["genreName"]
    genreCount = metaData["genreCount"]
    genreIndex = metaData["genreIndex"]
    letterValue = metaData["letterValue"]
    pageCount = metaData["pageCount"]
    pageIndex = metaData["pageIndex"]
    podcastId = metaData["podcastId"]
    podcastName = metaData["podcastName"]
    podcastGenreIndex = metaData["podcastGenreIndex"]
    podcastGenreCount = metaData["podcastGenreCount"]
    podcastCount = str(len(self.indexedPodcasts))
    # itunes1: lookup keyed by the scraped podcast id.
    # itunes2: optional 2nd lookup keyed by itunes1's artistId -- the
    # "artist" there may represent either the channel or the owner.
    itunes1 = metaData["itunes1"]
    itunes2 = metaData["itunes2"] if "itunes2" in metaData else {}
    episodes = x.select("./channel/item")
    podcastEpisodeCount = str(len(episodes))
    rssUrl = response.url
    items = []
    for episode in episodes:
        l = XPathItemLoader(PodcastItem(), selector=episode)
        #############
        # RSS per channel values
        #############
        l.add_xpath("rssChannelTitle", "//./channel/title/text()")
        l.add_xpath("rssChannelLink", "//./channel/link/text()")
        l.add_xpath("rssChannelDescription", "//./channel/description/text()")
        l.add_xpath("rssChannelCopyright", "//./channel/copyright/text()")
        l.add_xpath("rssChannelLanguage", "//./channel/language/text()")
        l.add_xpath("brandLanguage", "//./channel/language/text()")
        l.add_xpath("rssChannelGenerator", "//./channel/generator/text()")
        l.add_xpath("rssChannelPubDate", "//./channel/pubDate/text()")
        l.add_xpath("rssChannelitunesAuthor", "//./channel/itunes:author/text()")
        l.add_xpath("rssChannelitunesCategory", "//./channel/itunes:category/@text")
        l.add_xpath("rssChannelitunesExplicit", "//./channel/itunes:explicit/text()")
        l.add_xpath("rssChannelitunesImage", "//./channel/itunes:image/@href")
        l.add_xpath("rssChannelitunesKeywords", "//./channel/itunes:keywords/text()")
        l.add_xpath("rssChannelitunesOwnerEmail", "//./channel/itunes:owner/itunes:email/text()")
        l.add_xpath("rssChannelitunesOwnerName", "//./channel/itunes:owner/itunes:name/text()")
        l.add_xpath("rssChannelitunesSubtitle", "//./channel/itunes:subtitle/text()")
        l.add_xpath("rssChannelitunesSummary", "//./channel/itunes:summary/text()")
        l.add_xpath("rssChannelImageUrl", "//./channel/image/url/text()")
        l.add_xpath("rssChannelImageTitle", "//./channel/image/title/text()")
        l.add_xpath("rssChannelImageLink", "//./channel/image/link/text()")
        l.add_xpath("rssChannelLastBuildDate", "//./channel/lastBuildDate/text()")
        #############
        # RSS per item values
        #############
        l.add_xpath("rssItemTitle", "./title/text()")
        l.add_xpath("rssItemDescription", "./description/text()")
        l.add_xpath("rssItemPubDate", "./pubDate/text()")
        l.add_xpath("rssItemLink", "./link/text()")
        l.add_xpath("rssItemGuid", "./guid/text()")
        l.add_xpath("rssItemAuthor", "./author/text()")
        l.add_xpath("rssItemitunesSummary", "./itunes:summary/text()")
        l.add_xpath("rssItemitunesKeywords", "./itunes:keywords/text()")
        l.add_xpath("rssItemitunesDuration", "./itunes:duration/text()")
        l.add_xpath("rssItemitunesExplicit", "./itunes:explicit/text()")
        l.add_xpath("rssItemEnclosureUrl", "./enclosure/@url")
        l.add_xpath("rssItemEnclosureLength", "./enclosure/@length")
        l.add_xpath("rssItemEnclosureType", "./enclosure/@type")
        l.add_xpath("episodeDuration", "./itunes:duration/text()")
        #############
        # iTunes lookup meta values
        #############
        if "artistId" in itunes1:
            l.add_value("itunesArtistId", str(itunes1["artistId"]))
        if "artistName" in itunes1:
            l.add_value("itunesArtistName", itunes1["artistName"])
        if "artistViewUrl" in itunes1:
            l.add_value("itunesArtistViewUrl", itunes1["artistViewUrl"])
        if "artworkUrl100" in itunes1:
            l.add_value("itunesArtworkUrl100", itunes1["artworkUrl100"])
        if "artworkUrl30" in itunes1:
            l.add_value("itunesArtworkUrl30", itunes1["artworkUrl30"])
        if "artworkUrl60" in itunes1:
            l.add_value("itunesArtworkUrl60", itunes1["artworkUrl60"])
        if "artworkUrl600" in itunes1:
            l.add_value("itunesArtworkUrl600", itunes1["artworkUrl600"])
        if "collectionId" in itunes1:
            l.add_value("itunesCollectionId", str(itunes1["collectionId"]))
        if "collectionName" in itunes1:
            l.add_value("itunesCollectionName", itunes1["collectionName"])
        if "collectionPrice" in itunes1:
            l.add_value("itunesCollectionPrice", str(itunes1["collectionPrice"]))
        if "collectionViewUrl" in itunes1:
            l.add_value("itunesCollectionViewUrl", itunes1["collectionViewUrl"])
        if "genreIds" in itunes1:
            l.add_value("itunesGenreIds", itunes1["genreIds"])
        if "genres" in itunes1:
            l.add_value("itunesGenres", itunes1["genres"])
        # BUG FIX: these guards previously tested the prefixed output name
        # (e.g. "itunesCurrency"), which never occurs in the lookup JSON.
        if "collectionCensoredName" in itunes1:
            l.add_value("itunesCollectionCensoredName", itunes1["collectionCensoredName"])
        if "collectionExplicitness" in itunes1:
            l.add_value("itunesCollectionExplicitness", itunes1["collectionExplicitness"])
        if "currency" in itunes1:
            l.add_value("itunesCurrency", itunes1["currency"])
        # if 'itunesPopular' in itunes1 : l.add_value('itunesPopular', itunes1['popular'])
        # if 'itunesPopularInGenre' in itunes1 : l.add_value('itunesPopularInGenre', itunes1['popularInGenre'])
        if "trackCensoredName" in itunes1:
            l.add_value("itunesTrackCensoredName", itunes1["trackCensoredName"])
        if "trackCount" in itunes1:
            l.add_value("itunesTrackCount", itunes1["trackCount"])
        if "primaryGenreName" in itunes1:
            l.add_value("itunesPrimaryGenreName", itunes1["primaryGenreName"])
        if "releaseDate" in itunes1:
            l.add_value("itunesReleaseDate", itunes1["releaseDate"])
        if "trackExplicitness" in itunes1:
            l.add_value("itunesTrackExplicitness", itunes1["trackExplicitness"])
        if "trackId" in itunes1:
            l.add_value("itunesTrackId", str(itunes1["trackId"]))
        if "trackPrice" in itunes1:
            l.add_value("itunesTrackPrice", str(itunes1["trackPrice"]))
        if "trackViewUrl" in itunes1:
            l.add_value("itunesTrackViewUrl", itunes1["trackViewUrl"])
        l.add_value("itunesPopular", metaData["itunesPopular"])
        l.add_value("itunesPopularInGenre", metaData["itunesPopularInGenre"])
        l.add_value("itunesSimilar", metaData["podcastSimilar"])
        l.add_value("itunesRelated", metaData["podcastRelated"])
        item = l.load_item()
        #############
        ## Test and copy from alternate sources
        #############
        #######
        ## Episode items
        #############
        item["id"] = item["rssItemGuid"] if "rssItemGuid" in item else ["Unknown"]
        item["episodeId"] = item["rssItemGuid"] if "rssItemGuid" in item else ["Unknown"]
        item["episodeTitle"] = item["rssItemTitle"] if "rssItemTitle" in item else ["Unknown"]
        # NOTE(review): rssItemSubtitle is never loaded above, so this
        # fallback always yields "Unknown" -- confirm intended source.
        item["episodeSubtitle"] = item["rssItemSubtitle"] if "rssItemSubtitle" in item else ["Unknown"]
        item["episodeDescription"] = item["rssItemDescription"] if "rssItemDescription" in item else ["Unknown"]
        item["episodeDuration"] = item["episodeDuration"] if "episodeDuration" in item else ["0"]
        item["episodeMimeType"] = item["rssItemEnclosureType"] if "rssItemEnclosureType" in item else ["Unknown"]
        item["episodeMedia"] = item["rssItemEnclosureUrl"] if "rssItemEnclosureUrl" in item else ["Unknown"]
        item["episodeImage"] = item["rssChannelImageUrl"] if "rssChannelImageUrl" in item else ["Unknown"]
        item["episodeHomepage"] = item["rssItemLink"] if "rssItemLink" in item else ["Unknown"]
        # item['episodeFirstBroadcast'] = item['rssItemGuid']
        # item['episodeAvailableUntil'] = item['rssItemGuid']
        item["episodeTags"] = item["rssItemitunesKeywords"] if "rssItemitunesKeywords" in item else ["Unknown"]
        # item['episodeRelatedTitle'] / episodeRelatedUrl / series* fields
        # have no source in this feed and remain unset.
        if "itunesReleaseDate" in item:
            item["episodePublishDate"] = item["itunesReleaseDate"]
        elif "rssItemPubDate" in item:
            item["episodePublishDate"] = item["rssItemPubDate"]
        else:
            item["episodePublishDate"] = ["1900-01-01T00:00:00Z"]
        #######
        ## Brand items
        #############
        item["brandId"] = [rssUrl]
        item["brandFeed"] = [rssUrl]
        # brandName
        if "rssChannelTitle" in item:
            item["brandName"] = item["rssChannelTitle"]
        elif "itunesTrackName" in item:
            item["brandName"] = item["itunesTrackName"]
        else:
            item["brandName"] = ["Unknown"]
        if "rssChannelDescription" in item:
            item["brandDescription"] = item["rssChannelDescription"]
        elif "podcastDescription" in metaData:
            item["brandDescription"] = metaData["podcastDescription"]
        else:
            # BUG FIX: was a bare ["Unknown"] expression with no assignment.
            item["brandDescription"] = ["Unknown"]
        item["brandSimilar"] = metaData["podcastSimilar"]
        item["brandRelated"] = metaData["podcastRelated"]
        item["brandHomepage"] = item["rssChannelLink"] if "rssChannelLink" in item else ["Unknown"]
        item["brandLanguage"] = item["brandLanguage"] if "brandLanguage" in item else ["xx"]
        item["brandTags"] = item["rssChannelitunesKeywords"] if "rssChannelitunesKeywords" in item else ["Unknown"]
        item["brandGenres"] = item["itunesGenres"] if "itunesGenres" in item else ["Unknown"]
        item["brandGenreIds"] = item["itunesGenreIds"] if "itunesGenreIds" in item else ["Unknown"]
        if "itunesArtworkUrl100" in item:
            item["brandImage"] = item["itunesArtworkUrl100"]
        elif "rssChannelitunesImage" in item:
            item["brandImage"] = item["rssChannelitunesImage"]
        else:
            item["brandImage"] = ["Unknown"]
        # itunes feeds do not stick to the parent genre in
        # itunesPrimaryGenreName as found here
        # http://itunes.apple.com/us/genre/podcasts/id26?mt=2
        # this makes it difficult to have a smallish number of top level
        # categories so the text value from the scrape is used in preference.
        # (genreName is read unconditionally above, so the elif fallbacks
        # below are effectively dead -- kept as defensive defaults.)
        if "genreName" in metaData:
            item["brandCategory"] = metaData["genreName"]
        elif "itunesPrimaryGenreName" in item:
            item["brandCategory"] = item["itunesPrimaryGenreName"]
        elif "rssChannelitunesCategory" in item:
            item["brandCategory"] = [item["rssChannelitunesCategory"][0]]
        else:
            item["brandCategory"] = ["Unknown"]
        #######
        ## Channel items
        #############
        item["channelId"] = item["itunesArtistId"] if "itunesArtistId" in item else ["Unknown"]
        if "rssChannelitunesAuthor" in item:
            item["channelName"] = item["rssChannelitunesAuthor"]
        elif "itunesArtistName" in item:
            item["channelName"] = item["itunesArtistName"]
        else:
            item["channelName"] = ["Unknown"]
        #######
        ## Depends on what itunes returns for Artist lookup
        #######
        # itunes1 keyed on scraped podcastId; itunes2 keyed on artistId from
        # the first lookup when present.
        # No artistId in itunes1: track/artist names are all we have.
        if "artistId" not in itunes1:
            item["brandName"] = itunes1["trackName"]
            item["channelName"] = itunes1["artistName"]
            item["ownerName"] = "Unknown"
        # artistId present and a 2nd lookup was made:
        elif "artistId" in itunes1 and "artistId" in itunes2:
            # Same artist both times: the artist is the owner.
            if itunes1["artistId"] == itunes2["artistId"]:
                item["brandName"] = itunes1["trackName"]
                item["channelName"] = "Unknown"
                item["ownerName"] = itunes1["artistName"]
                item["ownerId"] = itunes1["artistId"]
            # Different artists: itunes1 is the channel, itunes2 the owner.
            else:
                item["brandName"] = itunes1["trackName"]
                item["channelName"] = itunes1["artistName"]
                item["ownerName"] = itunes2["artistName"]
                item["ownerId"] = itunes2["artistId"]
        self.indexedEpisodes.append(item["episodeId"])
        episodeCount = str(len(self.indexedEpisodes))
        episodeTitle = item["episodeTitle"]
        podcastEpisodeIndex = str(len(items))
        self.log(
            "[%s/%s] Indexing genre %s[%d of %d] letter %s page [%s of %d] podcast [id:%s] [%d of %d] %s : [%s/%s] %s"
            % (
                episodeCount,
                podcastCount,
                genreName,
                genreIndex,
                genreCount,
                letterValue,
                pageIndex,
                pageCount,
                podcastId,
                podcastGenreIndex,
                podcastGenreCount,
                podcastName,
                podcastEpisodeIndex,
                podcastEpisodeCount,
                episodeTitle,
            ),
            level=log.INFO,
        )
        items.append(item)
    return items
def parse(self, response):
    """Discovery entry point: the response is either an Atom feed of
    podcast entries or an RSS feed of items.

    Each record's name/id/description is collected into a metaData dict and
    handed to _build_itunes_request(), which yields one iTunes lookup (by
    track id) or search (by title) request per record.
    """
    x = XmlXPathSelector(response)
    x.register_namespace('atom', 'http://www.w3.org/2005/Atom')
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    x.register_namespace("im", "http://itunes.apple.com/rss")
    #either a rss feed of items
    #or a rss feed of feeds with entry(s)
    items = x.select('//rss/item')
    entries = x.select('//atom:entry')
    # (debug "print entries, items" removed)
    for entry in entries:
        metaData = {}
        metaData['brandName'] = entry.select('./atom:title/text()').extract()
        metaData['itunesTrackId'] = entry.select('./atom:id/@im:id').extract()
        metaData['brandDescription'] = entry.select('./atom:summary/text()').extract()
        yield self._build_itunes_request(metaData)
    for item in items:
        metaData = {}
        metaData['brandName'] = item.select('./title/text()').extract()
        metaData['itunesTrackId'] = item.select('./id/@im:id').extract()
        metaData['brandDescription'] = item.select('./summary/text()').extract()
        yield self._build_itunes_request(metaData)

def _build_itunes_request(self, metaData):
    """Build the iTunes request for one discovered podcast record.

    Uses a lookup by track id when one was scraped, otherwise falls back to
    a title (+ optional owner name) search; counts the podcast as indexed.
    Returns the scrapy Request (metaData is deep-copied into request.meta).
    """
    if metaData['itunesTrackId']:
        metaData['itunesTrackId'] = metaData['itunesTrackId'][0]
    #itunes podcast html
    #from an Id
    if 'itunesTrackId' in metaData and metaData['itunesTrackId']:
        self.logProgress('parsePage from Id', metaData['brandName'][0], '', metaData['itunesTrackId'], log.INFO, str(metaData['itunesTrackId']))
        request = Request('http://itunes.apple.com/lookup?id=' + metaData['itunesTrackId'],
                          meta={'metaData': copy.deepcopy(metaData)},
                          callback=self.getItunesTrackJson)
    #if not from the title
    else:
        self.logProgress('parsePage from title', metaData['brandName'][0], '', '---------', log.INFO)
        try:
            ownerName = metaData['ownerName'][0]
        except:
            # ownerName is optional in metaData; fall back to empty string.
            ownerName = ''
        #&attribute=titleTerm removed whilst using the owner name in the string as well
        request = Request('http://itunes.apple.com/search?term=' + metaData['brandName'][0] + ownerName + '&entity=podcast',
                          meta={'metaData': copy.deepcopy(metaData)},
                          callback=self.getItunesTrackJson)
    self.indexedPodcasts.append(1)
    return request
def parse(self, response): x = XmlXPathSelector(response) x.remove_namespaces() x.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#") items = [] items = x.select('//record/metadata/RDF') jsons = [] for item in items: creator = item.select( 'MetaResource/creator/Agent/name/text()').extract() title = item.select('Resource/title/text()').extract() uri = item.select('Resource/screen/Image/@rdf:about').extract() tags = item.select( 'Resource/subject/Description/value/text()').extract() thumbnail = item.select( 'Resource/thumbnail/Image/@rdf:about').extract() lat = item.select( 'Resource/spatial/Description/lat/text()').extract() long = item.select( 'Resource/spatial/Description/long/text()').extract() locality = item.select( 'Resource/spatial/Description/locality/text()').extract() tags_string = '"' + '", "'.join(tags) + '"' if not lat: newlat = 'null' else: newlat = lat[0] if not long: newlong = 'null' else: newlong = long[0] if not locality: newloc = '' else: newloc = locality[0] json_entry = '{"title": "' + title[0] + '", "uri": "' + uri[ 0] + '", "attribution_uri": "' + uri[ 0] + '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail[ 0] + '", "media_geo_latitude": ' + newlat + ', "media_geo_longitude": ' + newlong + ', "location": "' + newloc + '", "tags": [' + tags_string + '], "archive":"Yahoo! Japan", "media_type": "Image", "layer_type": "Image", "child_items_count":0, "published":1}, ' jsons.append(json_entry) resumptionToken = x.select('//resumptionToken/text()').extract() if resumptionToken == []: nextFileLink = '' open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8")) else: nextFileLink = "http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + resumptionToken[ 0].encode('ascii') open(resumptionToken[0].encode('ascii') + '.txt', 'wb').write(''.join(jsons).encode("UTF-8")) yield Request(nextFileLink, callback=self.parse)
def parse(self, response):
    """Parse one OAI-PMH ListRecords page from the Kahoku Shimpo archive.

    For each RDF record: derive media fields by category (IMAGE/MOVIE/
    DOCUMENT/OTHER), optionally download images, geocode the assembled
    location via the Google Maps API, and emit a hand-built JSON fragment.
    The fragments are written to ``<token>.json`` (or ``final-<date>.json``
    on the last page) and the resumptionToken is followed to the next page.
    """
    #########
    # Setup #
    #########
    x = XmlXPathSelector(response)
    x.remove_namespaces()
    x.register_namespace('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')
    category = getCAT()
    category = category.upper()
    # Per-category working directory and bookkeeping files.
    output_path = self.PATH + category.lower() + '_output/'
    token_path = output_path + '.previous_resumption_token'
    dup_path = output_path + '.dup_list'
    items = []
    items = x.select('//record/metadata/RDF')
    result = []
    id_list = []
    nextFileLink = ''
    resumption_token = x.select('//resumptionToken/text()').extract()
    saveResumptionToken(resumption_token, token_path)
    ###############
    # Parse Items #
    ###############
    print '****** PARSING FILE... ******'
    for item in items:
        ####################
        ##### creator ######
        ##### archive ######
        #### layer_type ####
        ####################
        media_creator_username = '******'
        archive = 'Kahoku Shimpo Disasters Archive'
        layer_type = 'Image'
        ####################
        #### media_type ####
        ####################
        # NOTE(review): if getCAT() returns anything else, media_type is
        # unbound and the json_entry below raises NameError.
        if category == 'MOVIE':
            media_type = 'Video'
        if category == 'DOCUMENT' or category == 'OTHER':
            media_type = 'Headline'
        if category == 'IMAGE':
            media_type = 'Image'
        ######################
        # media_date_created #
        ######################
        media_date_created = item.select('Resource/created/text()').extract()
        media_date_created = processField(media_date_created)
        ##################
        #### abstract ####
        ##### title ######
        ##################
        abstract = item.select('Resource/abstract/text()').extract()
        title = item.select('Resource/title/text()').extract()
        title = processField(title)
        abstract = processField(abstract)
        abstract = abstract.replace('\r\n', '')
        # Abstract tends to be more unique, though not always there.
        # Title is often repetitive but more consistently present.
        if not abstract:
            abstract = title
        ###################
        #### unique_id #####
        ###################
        unique_id = item.select('Resource/identifier/text()').extract()
        unique_id = str(unique_id[0])
        # Used for de-duping
        id_list.append(unique_id)
        ####################
        ###### Source ######
        ####################
        source = item.select('Resource/@rdf:about').extract()
        source = processField(source)
        ####################
        ####### URI ########
        ####################
        # Download image if it has not already been downloaded
        if category == 'IMAGE':
            uri = item.select('Resource/screen/Image/@rdf:about').extract()
            downloaded = os.path.exists(output_path + unique_id + '.jpg')
            if uri and not downloaded:
                uri = uri[0]
                urllib.urlretrieve(uri, output_path + unique_id + '.jpg')
            # Published URI always points at the S3 mirror, not the origin.
            uri = 'https://s3.amazonaws.com/JDA-Files/' + unique_id
        if category == 'MOVIE':
            uri = item.select('Resource/ogg/Image/@rdf:about').extract()
        if category == 'DOCUMENT' or category == 'OTHER':
            uri = source
        uri = processField(uri)
        ####################
        #### Thumbnail #####
        ####################
        thumbnail_url = item.select('Resource/thumbnail/Image/@rdf:about').extract()
        thumbnail_url = processField(thumbnail_url)
        ####################
        ####### Tags #######
        ####################
        tags = item.select('Resource/subject/Description/value/text()').extract()
        if not tags:
            # NOTE(review): '[]' is later spliced inside '[' ... ']', which
            # yields "tags": [[]] -- a nested empty array; confirm intended.
            tags_string = '[]'
        else:
            tags_string = '"' + '", "'.join(tags) + '"'
        ####################
        ##### Location #####
        ####################
        region = item.select('Resource/spatial/Description/region/text()').extract()
        locality = item.select('Resource/spatial/Description/locality/text()').extract()
        street_address = item.select('Resource/spatial/Description/street-address/text()').extract()
        if region or locality or street_address:
            region = processField(region)
            locality = processField(locality)
            street_address = processField(street_address)
            locationTemp = [street_address, locality, region]
            location = ''
            # Handles comma location and attribute existence variability
            # NOTE(review): this inner loop reuses the name "item",
            # clobbering the outer record selector. It happens to work only
            # because "item" is not read again before the next outer
            # iteration -- rename one of them.
            for item in locationTemp:
                if item:
                    if location == '':
                        location = location + item
                    else:
                        location = location + ', ' + item
            if location[location.__len__()-1] == ',':
                location = location[:-1]
        else:
            location = ''
        ##########################
        ######## Lat/Long ########
        ##########################
        # Find coordinates using Google Maps API
        lat = ''
        lng = ''
        if location != '':
            # NOTE(review): hard-coded API key checked into source --
            # should be moved to configuration and rotated.
            key = '&key=AIzaSyCGF2BwNPNckrbx6L2tQRATBcjKv0C3xCo'
            google_uri = 'https://maps.googleapis.com/maps/api/geocode/json?address='
            location_encoded = location.encode('utf8')
            location_url_ready = urllib.quote_plus(location_encoded, safe='')
            request_uri = google_uri + location_url_ready + key
            # NOTE(review): "response" here shadows the method's scrapy
            # response parameter for the rest of this iteration.
            with contextlib.closing(urllib.urlopen(request_uri)) as response:
                data = json.load(response)
            if json.dumps(data['results']) != '[]':
                lat = json.dumps(data['results'][0]['geometry']['location']['lat'])
                lng = json.dumps(data['results'][0]['geometry']['location']['lng'])
            else:
                lat = 'null'
                lng = 'null'
        ##########################
        ######## JSONify #########
        ##########################
        # NOTE(review): field values are spliced in without JSON escaping;
        # a double quote in any field corrupts the fragment.
        json_entry = (
            '{"title": "' + abstract + '", "uri": "' + uri + '", "attribution_uri": "' + source +
            '", "media_date_created": "' + media_date_created +
            '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail_url +
            '", "media_geo_latitude": "' + lat + '", "media_geo_longitude": "' + lng +
            '", "location": "' + location + '", "tags": [' + tags_string +
            '], "archive": "' + archive + '", "media_type": "' + media_type +
            '", "layer_type": "' + layer_type + '", "child_items_count": 0, "published": 1}, '
        )
        #####################
        # Duplicate Checker #
        #####################
        # Check for duplicates only in the "final-..." file since that is
        # the only file without a resumption token and thus could possibly
        # contain duplicates.
        if not resumption_token and os.path.exists(dup_path):
            dup_list = open(dup_path, 'r').read()
            if unique_id not in dup_list:
                print 'not in dup'
                result.append(json_entry)
        else:
            result.append(json_entry)
    ###################
    # Save Duplicates #
    ###################
    # Save Item URI List
    # NOTE(review): 'w+r' is not a standard mode string, the explicit
    # f.close() inside the with is redundant, and "item" is shadowed again
    # by this loop variable.
    with open(dup_path, 'w+r') as f:
        print '****** (OVER)WRITING DEDUP LIST ******'
        f.truncate()
        for item in id_list:
            print>>f, item
        f.close()
    ###########
    # If Done #
    ###########
    if resumption_token == []:
        print '****** DONE ******'
        nextFileLink = ""
        path = output_path + 'final-' + getDateString() + '.json'
        open(path, 'wb').write(''.join(result).encode('UTF-8'))
        removeEmptyFiles(output_path)
    ###############
    # Or Next Job #
    ###############
    else:
        url = self.template_url + category + '&resumptionToken='
        nextFileLink = url + resumption_token[0].encode('ascii')
        path = output_path + resumption_token[0].encode('ascii') + '.json'
        open(path, 'wb').write(''.join(result).encode('UTF-8'))
        yield Request(nextFileLink, callback=self.parse)
def parse_rss(self, response):
    """Build one PodcastItem per episode of a podcast RSS feed.

    Channel/owner/brand identity comes from the parent item carried in
    ``response.meta['parent']``; episode fields come from each <item>, and
    feed-level brand fields from the <channel> element.

    Returns a list of populated PodcastItem objects.
    """
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    episodes = x.select('./channel/item')
    items = []
    parent = response.meta['parent']

    # Channel-level values are identical for every episode, so run each
    # absolute XPath once instead of repeating it per <item>.
    feedCategories = x.select('//./channel/itunes:category/@text').extract()
    feedDescription = x.select('//./channel/description/text()').extract()
    feedTitle = x.select('//./channel/title/text()').extract()
    feedLanguage = x.select('//./channel/language/text()').extract()
    # NOTE(review): selects the <link> element itself (no /text()) -- the
    # extracted value is the serialized node; confirm intended.
    feedHomepage = x.select('//./channel/link').extract()
    feedImage = x.select('//./channel/itunes:image/@href').extract()

    for episode in episodes:
        item = PodcastItem()
        # Identity fields inherited from the discovering parent item.
        item['channelId'] = parent['channelId']
        item['channelName'] = parent['channelName']
        item['channelDescription'] = parent['channelDescription']
        item['channelImage'] = parent['channelImage']
        item['channelHomepage'] = parent['channelHomepage']
        item['ownerId'] = parent['ownerId']
        item['ownerName'] = parent['ownerName']
        item['ownerKey'] = parent['ownerKey']
        item['ownerImage'] = parent['ownerImage']
        item['ownerHomepage'] = parent['ownerHomepage']
        item['brandFeed'] = response.meta['rss']
        item['brandName'] = parent['brandName']
        item['brandId'] = parent['brandId']
        item['brandImage'] = parent['brandImage']
        # (brandIds, brandTimes, brandCurrentItem, brandAvgDuration,
        #  brandFrequency, brandTags and the series* fields have no source
        #  here and remain unset.)
        # Episode (item-level) fields -- <guid> doubles as the item id.
        item['id'] = episode.select('./guid/text()').extract()
        item['type'] = 'podcast'
        item['episodeId'] = episode.select('./guid/text()').extract()
        item['episodeTitle'] = episode.select('./title/text()').extract()
        item['episodeSubtitle'] = episode.select('./itunes:subtitle/text()').extract()
        item['episodeDescription'] = episode.select('./description/text()').extract()
        # NOTE(review): duration is taken from enclosure @length (bytes),
        # not a time value -- confirm downstream expects that.
        item['episodeDuration'] = episode.select('./enclosure/@length').extract()
        item['episodePublishDate'] = episode.select('./pubDate/text()').extract()
        item['episodeMimeType'] = episode.select('./media:content/@type').extract()
        item['episodeMedia'] = episode.select('./link/text()').extract()
        item['episodeFirstBroadcast'] = episode.select('./pubDate/text()').extract()
        # Feed-level brand fields -- copied per item so no list is shared.
        item['brandGenres'] = list(feedCategories)
        item['brandGenreIds'] = list(feedCategories)
        item['brandDescription'] = list(feedDescription)
        item['brandShortName'] = list(feedTitle)
        item['brandLanguage'] = list(feedLanguage)
        item['brandHomepage'] = list(feedHomepage)
        item['brandImage'] = list(feedImage)
        items.append(item)
    return items