Example #1
	def load_podcast_rss(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")
		metaData = response.meta['metaData']
		itunesTrackId = metaData['itunesTrackId']
		metaData['rssUrl'] = response.url
		
		# NOTE: a limit of 50 episodes is hard-coded here (position()<50 keeps
		# the first 49 items); this belongs in settings somewhere.
		episodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]][position()<50]')
		podcastEpisodeCount = str(len(episodes))

		items = []
		self.totalPodcastEpisodes = self.totalPodcastEpisodes + len(episodes)
		if len(episodes)==0:
			self.logProgress('Empty feed', metaData['brandName'][0], '', itunesTrackId, log.WARNING, ('No episodes for %s' % (response.url)))
			
			metaData['itemtype']=['noepisodes']
			item = self.load_item(x.select('//channel'), metaData)
			yield item
		else:
			podcastEpisodeIndex = str(len(items))
			podcastEpisodeCount = str(len(episodes))
			self.logProgress('load_podcast_rss', metaData['brandName'][0], '', itunesTrackId, log.INFO, ('%s/%s' % (podcastEpisodeIndex, podcastEpisodeCount)))
			for episode in episodes:
				metaData['itemtype']=['ondemand']
				item = self.load_item(episode, metaData)
				yield item
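The 50-episode cap above is flagged as belonging in settings; a minimal sketch of reading it from the project settings instead, assuming a Scrapy 0.x importable settings object and a hypothetical MAX_EPISODES key:

    from scrapy.conf import settings

    max_episodes = settings.getint('MAX_EPISODES', 50)
    episodes = x.select(
        '//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]]'
        '[position()<=%d]' % max_episodes)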
Example #2
    def parse_page_content(self, response):
        xxs = XmlXPathSelector(response)
        
        page_text = xxs.select('/api/query/pages/page/revisions/rev/text()').extract()
        if page_text:
            url = xxs.select('/api/query/pages/page/@fullurl').extract()
            if url:
                url = url[0]
            else:
                url = None

            page_text = page_text[0]
            for md_full in RE_INFOBOX_PAINTING.finditer(page_text):
                infobox = md_full.groups()[0]

                location = ''
                md = RE_IB_LOCATION.search(infobox)
                if md:
                    location = clean_wiki_string(md.groups()[0])

                artist = ''
                md_artist = RE_IB_ARTIST.search(infobox)
                if md_artist:
                    artist = clean_wiki_string(md_artist.groups()[0])

                name = ''
                md_name = RE_IB_NAME.search(infobox)
                if md_name:
                    name = clean_wiki_string(md_name.groups()[0])

                # only yield a complete record (same effect as the original
                # accidental nesting, but each field is checked independently)
                if location and artist and name:
                    yield ArtInfo(name=name, artist=artist, location=location, url=url)
Example #3
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     routetitle = xxs.select('//predictions/@routeTitle').extract()[0]
     stoptag = xxs.select('//predictions/@stopTag').extract()[0]
     predictions = xxs.select('//prediction')
     items = []
     for prediction in predictions:
         item = EtaScraperItem()
         item['seconds'] = prediction.select('@seconds').extract()[0]
         item['minutes'] = prediction.select('@minutes').extract()[0]
         item['is_departure'] = prediction.select("@isDeparture").extract()[0]
         item['dir_tag'] = prediction.select('@dirTag').extract()[0]
         item['trip_tag'] = prediction.select('@tripTag').extract()[0]
         item['vehicle_id'] = prediction.select('@vehicle').extract()[0]
         abl = prediction.select("@affectedByLayover").extract()
         if len(abl) > 0:
             item['affected_by_layover'] = abl[0]
         else:
             item['affected_by_layover'] = 'false'
         item['routename'] = routetitle
         item['stoptag'] = stoptag
         item['created'] = time()
         item['thisdate'] = datetime.now().date()
         direction = item['dir_tag']
         if direction.find(DIRECTION_OPTS[0][0]) == -1 and direction.find(DIRECTION_OPTS[1][0]) == -1:
             direc = DIRECTION_OPTS[2][1]
         elif direction.find(DIRECTION_OPTS[0][0]) != -1:
             direc = DIRECTION_OPTS[0][1]
         else:
             direc = DIRECTION_OPTS[1][1]
         item['dir_tag'] = direc
         items.append(item)
     return items
Example #4
    def test_unquote(self):
        xmldoc = '\n'.join((
            '<root>',
            '  lala',
            '  <node>',
            '    blabla&amp;more<!--comment-->a<b>test</b>oh',
            '    <![CDATA[lalalal&ppppp<b>PPPP</b>ppp&amp;la]]>',
            '  </node>',
            '  pff',
            '</root>'))
        xxs = XmlXPathSelector(text=xmldoc)

        self.assertEqual(xxs.extract_unquoted(), u'')

        self.assertEqual(xxs.select('/root').extract_unquoted(), [u''])
        self.assertEqual(xxs.select('/root/text()').extract_unquoted(), [
            u'\n  lala\n  ',
            u'\n  pff\n'])

        self.assertEqual(xxs.select('//*').extract_unquoted(), [u'', u'', u''])
        self.assertEqual(xxs.select('//text()').extract_unquoted(), [
            u'\n  lala\n  ',
            u'\n    blabla&more',
            u'a',
            u'test',
            u'oh\n    ',
            u'lalalal&ppppp<b>PPPP</b>ppp&amp;la',
            u'\n  ',
            u'\n  pff\n'])
Example #5
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        xxs.register_namespace('soapenv',
                               'http://schemas.xmlsoap.org/soap/envelope/')
        xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
        xxs.register_namespace('xsi',
                               'http://www.w3.org/2001/XMLSchema-instance')
        xxs.register_namespace(
            'CurrentsAndMetadata',
            'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl'
        )

        timelist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()'
        ).extract()
        cspdlist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()'
        ).extract()
        cdirlist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()'
        ).extract()

        print len(timelist)

        for i in range(0, len(cdirlist)):
            sql_str = self.SQL_INSERT_STUB.format(
                self.get_current_station().lower(),
                str(timelist[i])[0:-2], str(cspdlist[i]), str(cdirlist[i]),
                'datafactory_currentdata')
            #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
            d_time_unware = datetime.datetime.strptime(
                str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
            d_time1 = pytz.utc.localize(d_time_unware)
            d_time = d_time1.astimezone(pytz.utc)
            if self.needStore(d_time):
                self.db.query(sql_str)

        self.db.commit()

        if timelist:
            sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
                DB_SETTINGS['DATABASE_TIME_TABLE'], self.get_current_station(),
                self.startDate.astimezone(
                    pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
                self.endDate.astimezone(
                    pytz.utc).strftime("%Y-%m-%d %H:%M:%S"))

            self.db.query(sql_str)
            self.db.commit()

        self.station_slot = self.station_slot + 1

        if (self.station_slot < len(self.start_urls)):
            yield self.start_urls[self.station_slot]
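The SQL in this example is assembled with raw string formatting; a sketch of the timestamp insert as a parameterized query instead, assuming a DB-API 2.0 cursor (the example's self.db wrapper may differ):

    sql = "INSERT INTO {0} (sid, stime, etime) VALUES (%s, %s, %s)".format(
        DB_SETTINGS['DATABASE_TIME_TABLE'])
    cursor.execute(sql, (self.get_current_station(),
                         self.startDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
                         self.endDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")))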
Example #6
	def parse(self, response):
		x = XmlXPathSelector(response)
		x.remove_namespaces()
		x.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
		items = []
		items = x.select('//record/metadata/RDF')

		jsons = []

		for item in items:
			creator = item.select('MetaResource/creator/Agent/name/text()').extract()
			title = item.select('Resource/title/text()').extract()
			uri = item.select('Resource/screen/Image/@rdf:about').extract()
			tags = item.select('Resource/subject/Description/value/text()').extract()
			thumbnail = item.select('Resource/thumbnail/Image/@rdf:about').extract()
			lat = item.select('Resource/spatial/Description/lat/text()').extract()
			long = item.select('Resource/spatial/Description/long/text()').extract()
			locality = item.select('Resource/spatial/Description/locality/text()').extract()
			
			tags_string = '"' + '", "'.join(tags) + '"'
			
			if not lat:
				newlat = 'null'
			else:
				newlat = lat[0]

			if not long:
				newlong = 'null'
			else:
				newlong = long[0]

			if not locality:
				newloc = ''
			else:
				newloc = locality[0]
			
			
			
			json_entry = ('{"title": "' + title[0] + '", '
				'"uri": "' + uri[0] + '", '
				'"attribution_uri": "' + uri[0] + '", '
				'"media_creator_username": "******", '
				'"thumbnail_url": "' + thumbnail[0] + '", '
				'"media_geo_latitude": ' + newlat + ', '
				'"media_geo_longitude": ' + newlong + ', '
				'"location": "' + newloc + '", '
				'"tags": [' + tags_string + '], '
				'"archive":"Yahoo! Japan", '
				'"media_type": "Image", '
				'"layer_type": "Image", '
				'"child_items_count":0, '
				'"published":1}, ')
			
			
			jsons.append(json_entry)
			

		resumptionToken = x.select('//resumptionToken/text()').extract()
		if not resumptionToken:
			# last page: write the final batch and stop following links
			open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
		else:
			nextFileLink = "http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + resumptionToken[0].encode('ascii')
			open(resumptionToken[0].encode('ascii') + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
			yield Request(nextFileLink, callback=self.parse)
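Building the JSON entry by string concatenation, as above, breaks as soon as a title or location contains a double quote; a sketch of the same record via the standard json module (keeping latitude/longitude as None rather than the string 'null'):

    import json

    json_entry = json.dumps({
        "title": title[0],
        "uri": uri[0],
        "attribution_uri": uri[0],
        "media_creator_username": "******",
        "thumbnail_url": thumbnail[0],
        "media_geo_latitude": lat[0] if lat else None,
        "media_geo_longitude": long[0] if long else None,
        "location": newloc,
        "tags": tags,
        "archive": "Yahoo! Japan",
        "media_type": "Image",
        "layer_type": "Image",
        "child_items_count": 0,
        "published": 1,
    }) + ', '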
Example #7
    def parse_xml_document(self, response):
        xxs = XmlXPathSelector(response)
        votes = xxs.select('//meeting/vote')
        items = []

        for vote in votes:
            councilvote = VoteItem()
            votenum = int(vote.select('@number').extract()[0])
            councilvote["number"] = int(votenum)
            councilvote["date"] = vote.select('vote-date/text()').extract()[0]
            councilvote["time"] = vote.select('vote-time/text()').extract()[0]
            councilvote["motion_ch"] = vote.select('motion-ch/text()').extract()[0]
            councilvote["motion_en"] = vote.select('motion-en/text()').extract()[0]
            councilvote["mover_ch"] = vote.select('mover-ch/text()').extract()[0]
            councilvote["mover_en"] = vote.select('mover-en/text()').extract()[0]
            councilvote["mover_type"] = vote.select('mover-type/text()').extract()[0]
            councilvote["separate_mechanism"] = vote.select('vote-separate-mechanism/text()').extract()[0]
            if councilvote["separate_mechanism"] == 'Yes':
                mechanism = ['functional-constituency', 'geographical-constituency']
            else:
                mechanism = ['overall']
            for constituency in mechanism:
                if constituency == 'functional-constituency':
                    short = 'fc_'
                elif constituency == 'geographical-constituency':
                    short = 'gc_'
                else:
                    short = ''
                for count_type in ['present', 'vote', 'yes', 'no', 'abstain']:
                    councilvote[short+count_type] = int(vote.select('vote-summary/'+constituency+'/'+count_type+'-count/text()').extract()[0])
                councilvote[short+'result'] = vote.select('vote-summary/'+constituency+'/'+'result/text()').extract()[0]
            councilvote['result'] = vote.select('vote-summary/overall/result/text()').extract()[0]


            items.append(councilvote)

            members = xxs.select('//meeting/vote[%s]/individual-votes/member'%votenum)
            for member in members:
                individualvote = IndividualVoteItem()
                individualvote['number'] = councilvote["number"]
                individualvote['date'] = councilvote["date"]
                individualvote['name_ch'] = member.select('@name-ch').extract()[0]
                individualvote['name_en'] = member.select('@name-en').extract()[0]
                individualvote['constituency'] = member.select('@constituency').extract()[0]
                individualvote['vote'] = member.select('vote/text()').extract()[0]

                items.append(individualvote)


        return items
Example #8
    def parse(self, response):
        xxs = XmlXPathSelector(response)

        eis = xxs.select('/api/query/embeddedin/ei')
        for ei in eis:
            pageid = ei.select('@pageid').extract()
            if pageid:
                yield Request('http://en.wikipedia.org/w/api.php?action=query&prop=revisions|info&pageids=%s&rvprop=content&inprop=url&format=xml' % pageid[0],
                              callback=self.parse_page_content)

        cont = xxs.select('/api/query-continue/embeddedin/@eicontinue').extract()
        if cont:
            yield Request('http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&'
                          'eititle=Template:Infobox%%20artwork&eilimit=100&eifilterredir=nonredirects&format=xml&eicontinue=%s' % cont[0],
                          callback=self.parse)
Example #9
    def parse_rss(self, response):
        item = response.request.meta['item']

        if response.status != 500:
            xxs = XmlXPathSelector(response)
            xxs.remove_namespaces()

            item['date'] = xxs.select('.//channel/date/text()').extract()
            description = xxs.select('.//channel/description/text()').extract()
            if (len(item.get('description', '')) < 10) and description:
                item['description'] = ''.join(description).strip()

        del (item['subpage_urls'])

        return item
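Note that extract() always returns a list, so item['date'] above stores a list; a sketch of keeping a single string instead:

    date = xxs.select('.//channel/date/text()').extract()
    item['date'] = date[0] if date else None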
Example #10
    def parse_rss(self, response):
        item = response.request.meta['item']

        if response.status != 500:
            xxs = XmlXPathSelector(response)
            xxs.remove_namespaces()

            item['date'] = xxs.select('.//channel/date/text()').extract()
            description = xxs.select('.//channel/description/text()').extract()
            if (len(item.get('description', '')) < 10) and description:
                item['description'] = ''.join(description).strip()

        del(item['subpage_urls'])

        return item
Example #11
	def parse(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("im", "http://itunes.apple.com/rss")
		x.register_namespace('atom','http://www.w3.org/2005/Atom')
		feedCount = str(len(self.start_urls))
		self.i=self.i+1
		self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)
		entries = x.select('//atom:entry')
		
		if entries:

			# an iTunes RSS feed
			for entry in entries:
				id = entry.select('./atom:id/@im:id').extract()
				self.log('Entry %s' % (str(id)), level=log.INFO)
				yield Request('http://itunes.apple.com/lookup?id='+ id[0], callback=self.getItunesTrackJson)
			
			
			
		else:
			# a single feed
			l = XPathItemLoader(PodcastItem(), x)
			l.add_value('id', 'rssdisco_'+response.url)
			l.add_value('audioType', 'disco')
			l.add_value('brandFeed', response.url)
			l.add_xpath('brandName', '//./channel/title/text()')
			self.log('Feed from rss %s' % (response.url), level=log.INFO)
			
			item = l.load_item()
			
	
			yield item
Example #12
	def parse(self, response):
		x = XmlXPathSelector(response)
		#x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		
		#programs = x.select('./body/outline[position()=4]/outline[position()<4]')
		programs = x.select('//body/outline/outline')
		podcastCount = str(len(programs))
		i=0
		allitems=[]
		for program in programs:
			i=i+1
			l = XPathItemLoader(PodcastItem(), selector=program)
			l.add_xpath('id', 'concat("dpc_", ./@xmlUrl)')
			l.add_value('audioType', 'disco')
			l.add_xpath('brandId', './@xmlUrl')
			l.add_xpath('brandFeed', './@xmlUrl')
			l.add_xpath('brandName', './@title')
			l.add_xpath('brandDescription', './@description')
			l.add_xpath('brandHomepage', './@htmlUrl')
			
			self.log('Discovering dpc [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
		
		
			item = l.load_item()
			yield item
Example #13
	def parseSubGenre(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("kb", "http://www.kerbango.com/xml")
		metaData = response.meta['metaData']
		stations = x.select('//kb:results/kb:station_record')  # was limited to fewer than 5 for testing

		for station in stations:
			metaData['channelPlaylist'] = [station.select('./kb:station_url_record/kb:url/text()').extract()[0].rstrip('/ \r\n')]
			metaData['channelName'] = station.select('./kb:station/text()').extract()	
			metaData['channelDescription'] = station.select('./kb:description/text()').extract()	
			metaData['streamId'] = station.select('./kb:esid/text()').extract()	
			metaData['streamBandwidth'] = station.select('./kb:station_url_record/kb:bandwidth_kbps/text()').extract()	
			metaData['streamData'] = station.select('./kb:station_url_record/kb:status_code/text()').extract()	
			metaData['channelGenreIds'] = metaData['genreId']
			metaData['channelGenres'] = metaData['genreName']
			metaData['channelCategory'] = metaData['genreName']
			
			
			self.log('parseSubGenre %s %s' % (metaData['genreName'], metaData['channelName'] ), level=log.INFO)
			channelName = metaData['channelName'][0]
		channelName = re.sub(r'Low$|High$', '', channelName).strip()  # cope with BBC names that include the bitrate in the name
			tuneInSearchUrl = 'http://tunein.com/search/suggest/?query='+ channelName
			#assume all is well and the supplied url is indeed a playlist!
			
			request = Request(tuneInSearchUrl,
				meta = {'metaData': copy.deepcopy(metaData)},
				callback=self.parseTuneInSearch,
				errback=lambda x:self.parsePlaylist(x,copy.deepcopy(metaData)) )

			yield request
Example #14
	def parseFeed(self, response):
		jsonResponse = response.meta['jsonResponse']
		
		brandStats = jsonResponse['stats']['stats_fields']['episodePublishDate']
		#maxDate = brandStats['max']
		#updateDoc = '<delete><query>brandFeed:"'+brandFeed+'"</query></delete>'


		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")

		#########
		newEpisodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]]')
		metaData = {}
		metaData['rssUrl'] = response.url
		episodes = []
		#create a single solr update doc that contains all the new episodes and deletes expired ones
		

		for xmlEpisode in newEpisodes:
			jsonBrand = jsonResponse['grouped']['brandFeed']['groups'][0]['doclist']['docs'][0]
			episode = self.load_item(jsonBrand, xmlEpisode, metaData).__dict__.values()[0]
			episodes.append(episode)

		updatejson = JSONEncoder().encode(episodes)
		yield Request(
			url=self.solrUpdateUrl, 
			method='POST', 
			body=updatejson,
			headers={'Content-Type':'application/json'},
			callback=self.dummyEnd
		)
Example #15
 def handle_bug_xml(self, response):
     logging.info("STARTING XML")
     hxs = XmlXPathSelector(response)
     item = hxs.select('//item')
     try:
         parsed = bugimporters.items.ParsedBug({
             'title': item.select('title/text()').extract()[0],
             'description': item.select('description/text()').extract()[0] ,
             'status':  item.select('status/text()').extract()[0],
             'people_involved': 0, #TODO
             'date_reported': self.format_date(item.select('created/text()').extract()[0]),
             'last_touched': self.format_date(item.select('updated/text()').extract()[0]),
             'submitter_username': item.select('reporter/@username').extract()[0],
             'submitter_realname': item.select('reporter/text()').extract()[0],
             'canonical_bug_link': item.select('link/text()').extract()[0],
             'looks_closed': (item.select('status/text()').extract()[0] == 'Closed'),
             'last_polled': datetime.now(),
             # TODO tracker ids
             #'_project_name': self.tm.tracker_name,
             #'_tracker_name': self.tm.tracker_name,
         })
         yield parsed
     except IndexError as e:
         logging.exception(e)
         logging.debug("AHHHHHHHHHHHHHHHHHHHHHH!!!!!!!!!!!!!: {0}".format(item.select('title/text()').extract()[0]))
Example #16
 def parse(self, response):
     if self.value('link_extractor') is not None:
         xxs = XmlXPathSelector(response)
         links = xxs.select(self.value("link_extractor")).extract()
         return [Request(x, callback=self.parse_item) for x in links]
     else:
         return super(CommonSpider, self).parse(response)
Example #17
    def parse(self, response):
        # Create xml selector & get its contents as a string for regex parsing
        xxs = XmlXPathSelector(response)
        data = str(xxs.select('/courseinfo').extract())
        
        # Create course item
        item = CourseItem()

        # Get course number from url
        number_regex = re.compile('(..-...)')
        number_match = number_regex.search(response.url)
        if number_match is not None:
            item['number'] = number_match.group(1)
        
        # Construct regular expression for prerequisite decoding
        prereq_regex = re.compile(r'Prerequisite(?:s)?:(.*)\.')
        
        match = prereq_regex.search(data)
        if match is None:
            print item
            return
        print match.group(1)
        item['prereqs'] = match.group(1)
    
        print item
Example #18
    def parse(self, response):
        xxs = XmlXPathSelector(response)

        for product in xxs.select('//product'):
            category = product.select('./Category/text()').extract()
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('identifier', './product-id/text()')
            loader.add_xpath('sku', './product-id/text()')
            loader.add_xpath('url', './product-url/text()')
            loader.add_xpath('name', './product-name/text()')
            loader.add_xpath('brand', './brand/text()')
            loader.add_value(
                'price',
                extract_price_eu(' '.join(
                    product.select('./price/text()').extract())))
            if category:
                loader.add_value('category',
                                 category[0].split('/')[-1].strip())
            loader.add_xpath('image_url', './image-url/text()')
            loader.add_xpath('stock', './stock/text()')
            if loader.get_output_value('price') > 499:
                loader.add_value('shipping_cost', '0')
            else:
                loader.add_value('shipping_cost', '25')
            yield loader.load_item()
Example #19
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        links = xxs.select(
            "//item/*[local-name()='origLink']/text()").extract()

        return [Request(x, callback=self.parse_item) for x in links]
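The local-name() predicate matches the namespaced origLink element without registering its namespace; an equivalent sketch with the namespace registered explicitly, assuming the element is FeedBurner's origLink:

    xxs.register_namespace('feedburner', 'http://rssnamespace.org/feedburner/ext/1.0')
    links = xxs.select('//item/feedburner:origLink/text()').extract()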
Example #20
    def parse(self, response):
        # inspect_response(response, self)
        # return
        # hxs = HtmlXPathSelector(response)
        # file_path = "d:/work/GoogleFeed.xml"
        # f = open(file_path)
        # xxs = XmlXPathSelector(text=f.read())
        xxs = XmlXPathSelector(response)
        for sel in xxs.select('//channel/item'):  # ##
            loader = ProductLoader(item=Product(), response=response)
            tmp = sel.select('link/text()').extract()
            if tmp:
                loader.add_value('url', tmp[0])
            # ID
            tmp = sel.select('*[name()="g:id"]/text()').extract()
            if tmp:
                loader.add_value('identifier', tmp[0])
            # Sku
            tmp = sel.select('*[name()="g:id"]/text()').extract()
            if tmp:
                loader.add_value('sku', tmp[0])
            # Name
            tmp = sel.select('title/text()').extract()
            if tmp:
                loader.add_value('name', tmp[0])
            # price
            tmp = sel.select('*[name()="g:sale_price"]/text()').extract()
            if not tmp:
                tmp = sel.select('*[name()="g:price"]/text()').extract()
            if tmp:
                price = round(extract_price(tmp[0]) / Decimal('1.20'), 2)
                loader.add_value('price', price)
            # image_url
            tmp = sel.select('*[name()="g:image_link"]/text()').extract()
            if tmp:
                loader.add_value('image_url', tmp[0])
            # Brand
            tmp = sel.select('*[name()="g:brand"]/text()').extract()
            if tmp and tmp[0] != 'Alliance':
                loader.add_value('brand', tmp[0])
            # category
            tmp = sel.select('*[name()="g:product_type"]/text()').extract()
            if tmp:
                try:
                    loader.add_value('category', tmp[0].split('>')[1].strip())
                except:
                    loader.add_value('category', tmp[0].strip())
            # shipping_cost
            price = loader.load_item()['price']
            if price and price < 50.00:
                loader.add_value('shipping_cost', 5.90)
            # stock
            tmp = sel.select('*[name()="g:availability"]/text()').extract()
            if tmp and tmp[0] == 'in stock':
                loader.add_value('stock', 1)
            else:
                loader.add_value('stock', 0)

            yield loader.load_item()
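The *[name()="g:..."] predicates above match Google Base fields without registering namespaces; an alternative sketch that strips namespaces once so plain element names work (remove_namespaces() rewrites the parsed document in place):

    xxs.remove_namespaces()
    for sel in xxs.select('//channel/item'):
        identifier = sel.select('id/text()').extract()
        price = sel.select('sale_price/text()').extract() or sel.select('price/text()').extract()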
Example #21
	def parsePage(self, response):
		x = XmlXPathSelector(response)
		items = []
		feeds = x.select('//lst[@name="grouped"]/lst[@name="brandFeed"]/arr[@name="groups"]/lst')
		
		for feed in feeds:
			metaData={}
			metaData['brandAvgDuration'] = feed.select('./result/doc/str[@name="brandAvgDuration"]/text()').extract()[:1]
			metaData['brandCurrentItem'] = feed.select('./result/doc/str[@name="brandCurrentItem"]/text()').extract()[:1]
			metaData['brandDescription'] = feed.select('./result/doc/str[@name="brandDescription"]/text()').extract()[:1]
			metaData['brandFeed'] = feed.select('./result/doc/str[@name="brandFeed"]/text()').extract()[:1]
			metaData['brandFrequency'] = feed.select('./result/doc/str[@name="brandFrequency"]/text()').extract()[:1]
			metaData['brandHomepage'] = feed.select('./result/doc/str[@name="brandHomepage"]/text()').extract()[:1]
			metaData['brandId'] = feed.select('./result/doc/str[@name="brandId"]/text()').extract()[:1]
			metaData['brandIds'] = feed.select('./result/doc/arr[@name="brandIds"]/text()').extract()
			metaData['brandImage'] = feed.select('./result/doc/str[@name="brandImage"]/text()').extract()[:1]
			metaData['brandName'] = feed.select('./result/doc/str[@name="brandName"]/text()').extract()[:1]
			metaData['brandShortName'] = feed.select('./result/doc/str[@name="brandShortName"]/text()').extract()[:1]
			metaData['brandTimes'] = feed.select('./result/doc/str[@name="brandTimes"]/text()').extract()
			metaData['brandRegions'] = feed.select('./result/doc/arr[@name="brandRegions"]/text()').extract()
			metaData['channelHomepage'] = feed.select('./result/doc/str[@name="channelHomepage"]/text()').extract()[:1]
			metaData['channelId'] = feed.select('./result/doc/str[@name="channelId"]/text()').extract()[:1]
			metaData['channelName'] = feed.select('./result/doc/str[@name="channelName"]/text()').extract()[:1]
			metaData['itunesArtistId'] = feed.select('./result/doc/str[@name="itunesArtistId"]/text()').extract()[:1]
			metaData['itunesPopular'] = feed.select('./result/doc/int[@name="itunesPopular"]/text()').extract()[:1]
			metaData['itunesPopularInGenre'] = feed.select('./result/doc/int[@name="itunesPopularInGenre"]/text()').extract()[:1]
			metaData['itunesSimilar'] = feed.select('./result/doc/str[@name="itunesSimilar"]/text()').extract()[:1]
			metaData['itunesRelated'] = feed.select('./result/doc/str[@name="itunesRelated"]/text()').extract()[:1]
			metaData['itunesTrackId'] = feed.select('./result/doc/str[@name="itunesTrackId"]/text()').extract()[:1]
			metaData['ownerHomepage'] = feed.select('./result/doc/str[@name="ownerHomepage"]/text()').extract()[:1]
			metaData['ownerId'] = feed.select('./result/doc/str[@name="ownerId"]/text()').extract()[:1]
			metaData['ownerImage'] = feed.select('./result/doc/str[@name="ownerImage"]/text()').extract()[:1]
			metaData['ownerKey'] = feed.select('./result/doc/str[@name="ownerKey"]/text()').extract()[:1]
			metaData['ownerName'] = feed.select('./result/doc/str[@name="ownerName"]/text()').extract()[:1]
			
			
			if metaData['itunesTrackId']:
				metaData['itunesTrackId'] = metaData['itunesTrackId'][0]

			# iTunes lookup: by track id if we have one
			if 'itunesTrackId' in metaData and metaData['itunesTrackId']:
				self.logProgress('parsePage from Id', metaData['brandName'][0], '', metaData['itunesTrackId'], log.INFO, str(metaData['itunesTrackId']) )

				request = Request('http://itunes.apple.com/lookup?id='+ metaData['itunesTrackId'], meta = {'metaData': copy.deepcopy(metaData)}, callback=self.getItunesTrackJson)
			else:
				# otherwise search by title (plus owner name)
				self.logProgress('parsePage from title', metaData['brandName'], '', '---------', log.INFO)
				try:
					ownerName = metaData['ownerName'][0] 
				except:
					ownerName = ''
				#&attribute=titleTerm removed whilst using the owner name in the string as well
				request = Request('http://itunes.apple.com/search?term='+ metaData['brandName'][0] +' '+ ownerName +'&entity=podcast', meta = {'metaData': copy.deepcopy(metaData)}, callback=self.getItunesTrackJson)
			
			self.indexedPodcasts.append(1)
			yield request
Example #22
    def parse(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = xmliter(response, self.itertag)
        elif self.iterator == 'xml':
            selector = XmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = HtmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
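A minimal sketch of a spider that drives this dispatch; the class, URL and field names are illustrative (old-style scrapy.contrib import path assumed):

    from scrapy.contrib.spiders import XMLFeedSpider
    from scrapy.item import Item, Field

    class FeedEntry(Item):
        title = Field()

    class MyFeedSpider(XMLFeedSpider):
        name = 'myfeed'
        start_urls = ['http://www.example.com/feed.xml']
        iterator = 'xml'  # selects the XmlXPathSelector branch above
        itertag = 'item'

        def parse_node(self, response, node):
            entry = FeedEntry()
            entry['title'] = node.select('title/text()').extract()
            return entry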
Example #23
	def parseSubGenre(self, response):
		x = XmlXPathSelector(response)
		metaData = response.meta['metaData']
		stations = x.select('//body/outline[@key="stations"]/outline')
		relateds = x.select('//body/outline[@key="related"]/outline')

		for station in stations:
			metaData['channelUrl'] = station.select('@URL').extract()
			metaData['channelName'] = station.select('@text').extract()
			metaData['channelDescription'] = station.select('@subtext').extract()
			metaData['channelGenreId'] = station.select('@genre_id').extract()
			metaData['channelFormats'] = station.select('@formats').extract()	
			metaData['channelImage'] = station.select('@image').extract()
			metaData['channelTuneInItem'] = station.select('@item').extract()
			metaData['channelTuneInNowPlayingId'] = station.select('@now_playing_id').extract()
			metaData['channelTuneInPresetId'] =station.select('@preset_id').extract()
			metaData['channelTuneInType'] =station.select('@type').extract()
			metaData['channelTuneInBitrate'] =station.select('@bitrate').extract()
			metaData['channelTuneInReliability'] =station.select('@reliability').extract()
			metaData['channelTuneInGuideId'] =station.select('@guide_id').extract()
			metaData['channelTuneInShowId'] =station.select('@show_id').extract()
			metaData['channelTuneInCurrentTrack'] =station.select('@current_track').extract()
			
			if metaData['channelTuneInGuideId']:
				print '------------', metaData['channelTuneInGuideId']
				metaData['channelTuneInUrl'] = 'http://tunein.com/tuner/?StationId='+ metaData['channelTuneInGuideId'][0]
				self.indexedPodcasts.append(1)
				self.logProgress ('parseSubGenre', metaData['channelUrl'], '', '', level=log.DEBUG)

				url = 'http://opml.radiotime.com/Tune.ashx?formats=aac,mp3,wma,wmpro,wmvoice,mp3raw&render=json&id='+ metaData['channelTuneInGuideId'][0]
				request = Request(url , meta = {'metaData': copy.deepcopy(metaData)}, callback=self.getStreams)
				
				yield request
			else:
				self.logProgress ('parseSubGenre', metaData['channelName'], 'no guide id', '', level=log.WARNING)
				yield None

		for related in relateds:
			url = related.select('./@URL').extract()[0]
			self.logProgress ('parseSubGenre related links', url, '', '', level=log.DEBUG)
			request = Request(url , meta = {'metaData': copy.deepcopy(metaData)}, callback=self.parseSubGenre)
			yield request
Example #24
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        stores = xxs.select('//locationinfo')
        items = []
        for store in stores:
            item = TutItem()
            item['address']  = store.select('address/text()').extract()
            item['address2'] = store.select('address2/text()').extract()
            items.append(item)

        return items
Example #25
	def parse(self, response):
		xxs = XmlXPathSelector(response)
		entries = xxs.select('//item')
		for entry in entries:
			item = ZoinkscraperItem()

			item['name'] = entry.select('./title/text()')[0].extract_unquoted()
			item['url'] = entry.select('./link/text()')[0].extract()

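			# pubDate ends with a timezone offset (e.g. " +0000"); [:-6] strips it so it matches the strptime format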
			item['date'] = datetime.strptime(entry.select('./pubDate/text()')[0].extract()[:-6],'%a, %d %b %Y %H:%M:%S')
			yield item
Example #26
    def parse(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured(
                'You must define parse_node method in order to scrape this XML feed'
            )

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = XmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = HtmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
Example #27
    def test_selector_over_text(self):
        hxs = HtmlXPathSelector(text='<root>lala</root>')
        self.assertEqual(hxs.extract(),
                         u'<html><body><root>lala</root></body></html>')

        xxs = XmlXPathSelector(text='<root>lala</root>')
        self.assertEqual(xxs.extract(),
                         u'<root>lala</root>')

        xxs = XmlXPathSelector(text='<root>lala</root>')
        self.assertEqual(xxs.select('.').extract(),
                         [u'<root>lala</root>'])
Example #28
 def parse_travel_asy(self, response):
     xxs = XmlXPathSelector(response)
     xxs.remove_namespaces()
     json_object = json.loads(xxs.select("//string/text()").extract()[0])
     request_list = []
     for product in json_object['product']:
         if product['isYuyue'] == 'True':
             url = 'http://www.zhongmin.cn/Product/ProductDetails.aspx?pid=%s&bid=11' % product['Id']
         else:
             url = 'http://www.zhongmin.cn/Travel/Product/TravelDetailArr%(Id)s-%(age)sd%(day)s.html' % product
         request_list.append(Request(url = url))
     return request_list
Example #29
    def parse(self, response):
        """
        We define a custom parser here because we need to get the link from
        the feed item and then follow it to get the recipe data.

        Getting the data from <content:encoded> seems overly complex, as we
        would have to decode all the encoded characters and then build a DOM
        from that.
        """
        xxs = XmlXPathSelector(response)
        links = xxs.select("//item/*[local-name()='origLink']/text()").extract()
        return [Request(x, callback=self.parse_item) for x in links]
Example #30
	def load_rss(self, response):
		x = XmlXPathSelector(response)
		x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
		x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
		x.register_namespace("media", "http://search.yahoo.com/mrss/")
		
		title = x.select('//./channel/title/text()').extract()[0]
		parent = response.meta['parent']
		
		request = Request('http://itunes.apple.com/search?term='+ title +'&entity=podcast&attribute=titleTerm', meta = {'parent': parent, 'rss': x, 'rssUrl': response.url}, callback=self.get_itunes_info)
		
		return request
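The title is interpolated into the search URL unescaped; a sketch of the same request with the query term URL-encoded (urllib.quote is the Python 2 helper):

    import urllib

    term = urllib.quote(title.encode('utf-8'))
    request = Request('http://itunes.apple.com/search?term=' + term + '&entity=podcast&attribute=titleTerm',
                      meta={'parent': parent, 'rss': x, 'rssUrl': response.url},
                      callback=self.get_itunes_info)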
Example #31
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        xxs.register_namespace('soapenv', 'http://schemas.xmlsoap.org/soap/envelope/')
        xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
        xxs.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')        
        xxs.register_namespace('CurrentsAndMetadata', 'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl')

        timelist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()').extract()
        cspdlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()').extract()
        cdirlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()').extract()
	

        print len(timelist) 
        
        for i in range(0, len(cdirlist)):
            sql_str = self.SQL_INSERT_STUB.format(self.get_current_station().lower(), str(timelist[i])[0:-2], str(cspdlist[i]), str(cdirlist[i]), 'datafactory_currentdata')
            #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
            d_time_unware = datetime.datetime.strptime(str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
            d_time1 = pytz.utc.localize(d_time_unware)
            d_time = d_time1.astimezone(pytz.utc)
            if self.needStore(d_time):
                self.db.query(sql_str)

        self.db.commit()

        if timelist:
            sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
                DB_SETTINGS['DATABASE_TIME_TABLE'],
                self.get_current_station(),
                self.startDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
                self.endDate.astimezone(pytz.utc).strftime ("%Y-%m-%d %H:%M:%S")
            )

            self.db.query(sql_str)
            self.db.commit()

        self.station_slot = self.station_slot + 1

        if (self.station_slot < len(self.start_urls)):
            yield self.start_urls[self.station_slot]
Example #32
	def parseDetails(self, response):
		x = XmlXPathSelector(response)
		metaData = response.meta['metaData']

		related = x.select('//body/outline[@key="genres"]/outline[@type="link"]')
		metaData['channelRelatedGenres'] = related.select('@text').extract()
		metaData['channelRelatedGenreIds'] = related.select('@guide_id').extract()

		recommended = x.select('//body/outline[@key="recommendations"]/outline[@type="audio"]')
		metaData['channelRecommended'] = recommended.select('@text').extract()
		metaData['channelRecommendedDescription'] = recommended.select('@subtext').extract()
		metaData['channelRecommendedId'] = recommended.select('@guide_id').extract()
		metaData['channelRecommendedFormats'] = recommended.select('@formats').extract()
		metaData['channelRecommendedType'] = recommended.select('@item').extract()
		metaData['channelRecommendedImage'] = recommended.select('@image').extract()

		listing = x.select('//body/outline[@key="listing"]/outline[@type="object"]/station')
		metaData['channelCallSign'] = listing.select('call_sign/text()').extract()
		metaData['channelSlogan'] = listing.select('slogan/text()').extract()
		metaData['channelUrl'] = listing.select('url/text()').extract()
		metaData['channelTuneInReportUrl'] = listing.select('report_url/text()').extract()
		metaData['channelTuneInDetailUrl'] = listing.select('detail_url/text()').extract()
		metaData['channelTuneInIsPreset'] = listing.select('is_preset/text()').extract()
		metaData['channelTuneInIsAvailable'] = listing.select('is_available/text()').extract()
		metaData['channelTuneInIsMusic'] = listing.select('is_music/text()').extract()
		metaData['channelTuneInHasSong'] = listing.select('has_song/text()').extract()
		metaData['channelTuneInHasSchedule'] = listing.select('has_schedule/text()').extract()
		metaData['channelTuneInHasTopics'] = listing.select('has_topics/text()').extract()
		metaData['channelTuneInTwitterId'] = listing.select('twitter_id/text()').extract()
		metaData['channelLogo'] = listing.select('logo/text()').extract()
		metaData['channelLocation'] = listing.select('location/text()').extract()
		metaData['channelEmail'] = listing.select('email/text()').extract()
		metaData['channelPhone'] = listing.select('phone/text()').extract()
		metaData['channelAddress'] = listing.select('mailing_address/text()').extract()
		metaData['channelLanguage'] = listing.select('language/text()').extract()

		if metaData['channelTuneInGuideId']:
			url = 'http://opml.radiotime.com/Describe.ashx?c=composite&detail=options,schedules,listing,affiliates,genres,recommendations&id='+ metaData['channelTuneInGuideId'][0]
			request = Request(url , meta = {'metaData': copy.deepcopy(metaData)}, callback=self.createItem)
			yield request
Example #33
	def parse(self, response):
		x = XmlXPathSelector(response)
		total = int(x.select('//lst[@name="grouped"]/lst[@name="brandFeed"]/int[@name="ngroups"]/text()').extract()[0])
		pageSize = 100		
		urlBase = response.url
		start = 0 #try letting scrapy handle it all

		for i in range(start, total, pageSize):
			url = urlBase + '&start='+ str(i) 
			self.log('Requesting page of %d starting at %d of %d: %s' % (pageSize, i, total, url), log.DEBUG)
			# dont_filter=True stops Scrapy's duplicate filter from dropping repeated Solr requests
			request = Request( url, callback=self.parsePage, dont_filter=True)
			yield request
Example #34
    def parse(self, response):

        hxs = XmlXPathSelector(response)
        name = hxs.select('//name').extract()

        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            for item in self.process_item(self.bot_task_params(self.task_id)):
                yield item
        else:
            for item in self.process_items():
                yield item
Example #35
    def parse(self, response):

        hxs = XmlXPathSelector(response)
        name = hxs.select('//name').extract()

        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            for item in self.process_item(self.bot_task_params(self.task_id)):
                yield item
        else:
            for item in self.process_items():
                yield item
Example #36
def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = XmlXPathSelector(text=nodetext)
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.select(selxpath)[0]
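A hypothetical use of xmliter_lxml, streaming <item> nodes out of a large feed without building the whole DOM (node.clear() above frees each element as iteration advances):

    for node in xmliter_lxml(response, 'item'):
        print node.select('title/text()').extract()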
Example #37
def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = XmlXPathSelector(text=nodetext)
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.select(selxpath)[0]
Example #38
    def test_selector_namespaces_simple(self):
        body = """
        <test xmlns:somens="http://scrapy.org">
           <somens:a id="foo"/>
           <a id="bar">found</a>
        </test>
        """

        response = XmlResponse(url="http://example.com", body=body)
        x = XmlXPathSelector(response)
        
        x.register_namespace("somens", "http://scrapy.org")
        self.assertEqual(x.select("//somens:a").extract(), 
                         ['<somens:a id="foo"/>'])
Example #39
    def parse(self, response):
        """
        We define a custom parser here because we need to get the link from
        the feed item and then follow it to get the recipe data.

        Getting the data from <content:encoded> seems overly complex, as we
        would have to decode all the encoded characters and then build a DOM
        from that.
        """
        xxs = XmlXPathSelector(response)
        links = xxs.select(
            "//item/*[local-name()='origLink']/text()").extract()
        # self.parse_item comes from OnehundredonecookbooksMixin
        return [Request(x, callback=self.parse_item) for x in links]
Example #40
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        hxs = HtmlXPathSelector(response)
        links = xxs.select('//link/text()').extract()

        log.msg('Link length: %s' % len(links), level=log.ERROR)

        if len(links) <= 0:
            log.msg('no links found, using regular parser', level=log.ERROR)
            links = hxs.select('//a/@href').extract()

        msg = 'Links: %s' % links
        log.msg(msg, level=log.ERROR)

        return [Request(x, callback=self.parse_item) for x in links]
Example #41
 def detect_feed(self, response):
     """Just detects the feed in the links and returns an Item"""
     xxs = XmlXPathSelector(response)
     # TODO: tweak feedparser to reuse the headers/body from this response
     # instead of downloading the feed page again.

     if any(xxs.select("/%s" % feed_type) for feed_type in ['rss', 'feed', 'xml', 'rdf']):
         try:
             rssFeed = feedparser.parse(response.url)
             return self.extract_feed(rssFeed)
         except Exception:
             raise Exception('Exception while parsing/extracting the feed')

     return None
Example #42
 def parsePart(self, response):
     item = response.meta['item']
     xxs = XmlXPathSelector(response)
     if len(xxs.select("//ERRORSEGMENT")) == 0:
         part_num = response.meta['part_num']
         end_range = response.meta['end_range']
         part_prefix = response.meta['part_prefix']
         item['parts'].append(self.part_format % (part_prefix, part_num))
         if part_num < end_range:
             yield self.makePartRequest(part_prefix, part_num + 1, item,
                                        end_range)
         else:
             yield item
     else:
         yield item
Example #43
 def parse(self, response):
     base_url = get_base_url(response)
     xxs = XmlXPathSelector(response)
     xxs.register_namespace("g", "http://base.google.com/ns/1.0")
     products = xxs.select('//channel/item')
     for product in products:
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('url', 'link/text()')
         loader.add_xpath('name', 'title/text()')
         loader.add_xpath('image_url', 'g:image_link/text()')
         loader.add_xpath('price', 'g:price/text()')
         loader.add_xpath('brand', 'g:brand/text()')
         loader.add_xpath('category', 'g:brand/text()')
         loader.add_xpath('sku', 'g:id/text()')
         loader.add_xpath('identifier', 'g:id/text()')
         yield loader.load_item()
Example #44
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        products = xxs.select('//item')
        for product in products:
            mpn = product.xpath('mpn/text()')
            if mpn:
                mpn = mpn[0].extract().upper().strip()
            else:
                mpn = None
            row = self.monitored_products.get(mpn) if mpn else None
            if row is None or (row and row['Discontinued'].lower().strip()
                               == 'yes'):
                continue
            loader = ProductLoader(selector=product, item=Product())
            loader.add_xpath('identifier', 'id/text()')
            loader.add_xpath('sku', 'mpn/text()')
            loader.add_xpath('brand', 'brand/text()')
            loader.add_xpath('image_url', 'image_link/text()')
            loader.add_xpath('url', 'link/text()')
            loader.add_xpath('name', 'title/text()')
            price = product.select('sale_price/text()').extract()
            if not price:
                price = product.select('price/text()').extract()

            loader.add_value('price', extract_price(price[0]))

            categories = product.select(
                'product_type/text()').extract()[-1].split('>')
            categories = map(lambda x: x.strip(), categories)
            loader.add_value('category', categories)

            shipping_cost = product.select('shipping/price/text()').extract()
            shipping_cost = extract_price(
                shipping_cost[0]) if shipping_cost else ''
            loader.add_value('shipping_cost', shipping_cost)

            in_stock = product.select(
                'availability[contains(text(), "in stock")]').extract()
            if not in_stock:
                loader.add_value('price', 0)

            item = loader.load_item()
            item['metadata'] = RHSMeta()
            item['metadata']['cost_price'] = row['Cost Price']
            yield item
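Both sale_price and price may be absent for an item, in which case price[0] above raises IndexError; a guarded sketch (the zero fallback is illustrative):

    price = product.select('sale_price/text()').extract()
    if not price:
        price = product.select('price/text()').extract()
    loader.add_value('price', extract_price(price[0]) if price else 0)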
Example #45
    def get_products(self, meta, response, colors, colors_ids):
        hxs = XmlXPathSelector(response)
        names, ids = self.get_names(meta['base_name'], meta['product_id'],
                                    meta['current_data'], colors, colors_ids)

        for i, name in enumerate(names):
            p = ProductLoader(item=Product(), response=response)
            p.add_value('identifier', ids[i])
            p.add_value('name', name)
            p.add_value('brand', meta['brand'])
            p.add_value('url', meta['url'])
            p.add_value('image_url', meta['image_url'])
            price = hxs.select('//cmd[@t="discounted_price"]/text()').extract()
            if price:
                price = price[0].replace('.', '').replace(',', '.')
                price = extract_price(price)
            if not price or price == Decimal(1):
                if not price:
                    self.log('Price not found %s' % meta['url'])
                else:
                    self.log('Price is one %s' % meta['url'])

                if not self.retries.get(
                        meta['url']) or self.retries.get(meta['url']) < 3:
                    self.log('Retrying %s' % meta['url'])
                    self.retries[meta['url']] = self.retries.get(
                        meta['url'], 0) + 1
                    retry_url = meta['url']
                    yield Request(retry_url,
                                  meta={
                                      'category':
                                      response.meta.get('category', ''),
                                      'cookiejar':
                                      retry_url + str(self.retries.get(meta['url']))
                                  },
                                  },
                                  callback=self.parse_product,
                                  dont_filter=True)
                else:
                    self.log('Max retries reached %s' % meta['url'])
                return
            p.add_value('price', price)
            p.add_value('shipping_cost', '0')
            p.add_value('category', response.meta.get('category'))
            yield p.load_item()
Example #46
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     base_url = get_base_url(response)
     xxs.register_namespace("f", "http://www.w3.org/2005/Atom")
     products = xxs.select('//f:entry')
     for product in products:
         product.register_namespace("g", "http://base.google.com/ns/1.0")
         product.register_namespace("p", "http://www.w3.org/2005/Atom")
         product_loader = ProductLoader(item=Product(), selector=product)
         name = product.select('./p:title/text()').extract()[0]
         if 'B-STOCK' in name.upper():
             continue
         product_loader.add_value('name', name)
         url = product.select('./p:link/@href').extract()[0]
         product_loader.add_value('url', urljoin_rfc(base_url, url))
         image_url = product.select('./g:image_link/text()').extract()
         if image_url:
             product_loader.add_value('image_url',
                                      urljoin_rfc(base_url, image_url[0]))
         category = product.select('./g:product_type/text()').extract()
         if category:
             product_loader.add_value('category', category[0])
         brand = product.select('./g:brand/text()').extract()
         if brand:
             product_loader.add_value('brand', brand[0])
         price = product.select('./g:sale_price/text()').extract()
         if price:
             product_loader.add_value('price', extract_price(price[0]))
         else:
             price = product.select('./g:price/text()').extract()
             product_loader.add_value('price', extract_price(price[0]))
         # sku = product.select('./g:gtin/text()').extract()
         # if sku:
         #     product_loader.add_value('sku', sku[0])
         identifier = product.select('./g:id/text()').extract()[0]
         product_loader.add_value('identifier', identifier)
         product_loader.add_value('sku', identifier)
         shipping_cost = product.select(
             './g:shipping/g:price/text()').extract()
         if shipping_cost:
             product_loader.add_value('shipping_cost',
                                      extract_price(shipping_cost[0]))
         product = product_loader.load_item()
         yield product
Example #47
    def parse(self, response):
        xxs = XmlXPathSelector(response)

        for productxs in xxs.select(
                '//product[attribute_set/text()!="spares-accessories"]'):
            loader = ProductLoader(item=Product(), selector=productxs)
            loader.add_xpath('sku', './product_id/text()')
            loader.add_xpath('identifier', './product_id/text()')
            loader.add_xpath('price', './product_price/text()')
            loader.add_xpath('name', './product_name/text()')
            loader.add_xpath('url', './product_url/text()')
            loader.add_xpath('category', './attribute_set/text()')
            loader.add_xpath('brand', './manufacturer/text()')
            brand = loader.get_output_value('brand').strip().upper()

            if brand in self.ignore_brands:
                log.msg('Ignoring product %s because of brand %s' %
                        (loader.get_output_value('identifier'), brand))
                continue

            loader.add_value('stock', '1')

            item = loader.load_item()
            item['identifier'] = item['identifier'].upper()

            cost_price = productxs.select('./cost/text()').extract()
            metadata = CSCateringMeta()
            cost_price = cost_price[0].strip() if cost_price else '0.00'
            metadata['cost_price'] = cost_price
            item['metadata'] = metadata

            category = loader.get_output_value('category').strip().lower()

            if category in ignore_categories and not self.has_sku(
                    item.get('sku', '')):
                log.msg('Ignoring product %s because of category %s' %
                        (loader.get_output_value('identifier'), category))
                continue

            yield Request(item['url'],
                          callback=self.parse_img,
                          meta={'item': item})
Example #48
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     xxs.register_namespace("g", "http://base.google.com/ns/1.0")
     products = xxs.select('//channel/item')
     for product in products:
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('url', 'link/text()')
         loader.add_xpath('name', 'title/text()')
         loader.add_xpath('image_url', 'g:image_link/text()')
         loader.add_xpath('price', 'g:price/text()')
         loader.add_xpath('brand', 'g:brand/text()')
         categories = product.select(
             'g:product_type/text()').extract()[0].split(' > ')
         loader.add_value('category', categories)
         loader.add_xpath('sku', 'g:id/text()')
         loader.add_xpath('identifier', 'g:id/text()')
         stock = product.select(
             'g:availability/text()').extract()[0].lower()
         if stock != 'in stock':
             loader.add_value('stock', 0)
         yield loader.load_item()
Beispiel #49
0
def scrape_rss(response):
    log.msg("inside scrape rss")
    xxs = XmlXPathSelector(response)
    items = []
    for item_tag in xxs.select('//item'):
        items.append(ArticleItem())
        if len(item_tag.select("title")) > 0:
            items[-1]["title"] = item_tag.select("title/text()")[0].extract()
        if len(item_tag.select("pubDate")) > 0:
            items[-1]["time_published"] = [
                item_tag.select("pubDate/text()")[0].extract()
            ]
        if len(item_tag.select("link")) > 0:
            items[-1]["url"] = item_tag.select("link/text()")[0].extract()
        if len(item_tag.select("description")) > 0:
            items[-1]["summary"] = item_tag.select(
                "description/text()")[0].extract()

        if "url" not in items[-1]:
            continue
        request = Request(items[-1]["url"], callback=extract_author_from_link)
        request.meta["item"] = items[-1]
        yield request
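
The extract_author_from_link callback is referenced above but never defined in this listing; a minimal sketch using the same old-style selector API, assuming the author sits in a standard meta tag (the XPath is a guess, not from the original):

from scrapy.selector import HtmlXPathSelector

def extract_author_from_link(response):
    # Hypothetical callback, not shown in the original example: read a
    # byline off the article page and attach it to the in-progress item.
    hxs = HtmlXPathSelector(response)
    item = response.meta["item"]
    author = hxs.select('//meta[@name="author"]/@content').extract()
    item["author"] = author[0] if author else ""
    return item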
Beispiel #50
0
    def parse(self, response):
        hxs = XmlXPathSelector(response)
        shows = hxs.select('//show')
        date_from = datetime.now()
        date_to = date_from + timedelta(days=7 * 6)

        for show in shows:
            name = show.select('./name/text()').extract()[0]
            url = show.select('./@href').extract()[0]
            show_id = url.split('/')[-1]
            show_data = SHOWS_DATA % (show_id, date_from.strftime('%Y-%m-%d'),
                                      date_to.strftime('%Y-%m-%d'))
            r = Request(
                'https://api.entstix.com/api/v1/xlive/booking/book/availability/show',
                method='POST',
                body=show_data,
                callback=self.parse_products,
                meta={
                    'name': name,
                    'id': show_id
                })
            yield r
Beispiel #51
0
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        urls = xxs.select('//loc/text()').extract()
        for url in urls:
            if 'brands-sitemap.xml' in url:
                continue

            if 'productbrand' in url:
                prod_id = re.findall(r'productbrand_(\d+)\.html', url)
                prod_id = prod_id[0] if prod_id else ''
                if prod_id:
                    if prod_id in self.product_ids:
                        continue
                    else:
                        self.product_ids.append(prod_id)
                yield Request(url,
                              callback=self.parse_product,
                              meta={"dont_merge_cookies": True})
            else:
                yield Request(url, meta={"dont_merge_cookies": True})
Beispiel #52
0
    def parse_products(self, response):
        hxs = XmlXPathSelector(response)
        show_id = response.meta['id']
        name = response.meta['name']
        if not hxs.select('/availability/moreResults/text()'):
            self.log('No results for %s, %s' % (show_id, name))
            return

        if hxs.select(
                '/availability/moreResults/text()')[0].extract() != 'false':
            self.log('There are more results!')
            date_from = datetime.now()
            date_to = date_from + timedelta(days=7 * 6)
            show_data = SHOWS_DATA_NEXT % (
                show_id, date_from.strftime('%Y-%m-%d'),
                date_to.strftime('%Y-%m-%d'),
                hxs.select('/availability/navigate/@key')[0].extract())

            r = Request(
                'https://api.entstix.com/api/v1/xlive/booking/book/availability/show',
                method='POST',
                body=show_data,
                callback=self.parse_products,
                meta={
                    'name': name,
                    'id': show_id
                })
            yield r

        products = hxs.select('.//performances/performance')
        weekdays = [
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
            'Sunday'
        ]
        ids_seen = defaultdict(list)
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            face_value = product.select('.//faceValue/text()')[0].extract()
            price = product.select('.//saleprice/text()')[0].extract()
            date_ = product.select('.//date/text()')[0].extract()[4:]
            date_ = datetime.strptime(date_, '%d-%b-%Y %H:%M')
            type_ = product.select('.//type/text()')[0].extract()
            identifier = ':'.join(
                [show_id,
                 date_.strftime('%Y-%m-%d'), type_, face_value])
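            # If the same show/date/type/face-value combination reappears at a
            # different price, append the seating block id to keep the
            # identifier unique.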
            if identifier in ids_seen and price not in ids_seen[identifier]:
                ids_seen[identifier].append(price)
                identifier += '-' + product.select('.//block/@id')[0].extract()
            else:
                ids_seen[identifier].append(price)

            loader.add_value('identifier', identifier)
            loader.add_value('brand', face_value)
            loader.add_value('price', price)
            loader.add_value('name', name)
            loader.add_value('category', weekdays[date_.weekday()])

            p = loader.load_item()
            p['sku'] = date_.strftime('%d-%m-%y') + ' ' + type_.upper()

            yield p
Beispiel #53
0
    def parse_vote(self, response):
        if not hasattr(response, 'body_as_unicode'):
            self.log('Cannot parse: {u}'.format(u=response.url),
                     level=log.INFO)
            return
        x = XmlXPathSelector(response)

        info = x.select('//Resultado/Informacion')
        session_id = info.select('//Sesion/text()').extract()
        if not session_id:
            # can't identify session, so we skip this file
            self.log('Missing session ID: {u}'.format(u=response.url),
                     level=log.INFO)
            return
        # general session info
        session_id = session_id[0]
        session_date = date_parser.parse(
            info.select('//Fecha/text()').extract()[0], dayfirst=True)
        session_instance, session_created = Session.objects.get_or_create(
            session=session_id, defaults={'date': session_date})
        if not session_created:
            session_instance.date = session_date
            session_instance.save()

        # specific voting session info
        voting_number = info.select('//NumeroVotacion/text()').extract()
        if not voting_number:
            self.log('Missing voting number: {u}'.format(u=response.url),
                     level=log.INFO)
            return
        voting_number = voting_number[0]
        voting_title = info.select('//Titulo/text()').extract()[0]
        voting_text = info.select('//TextoExpediente/text()').extract()[0]
        voting_title_sub = info.select('//TituloSubGrupo/text()').extract()
        voting_title_sub = voting_title_sub[0] if voting_title_sub else ''
        voting_text_sub = info.select('//TextoSubGrupo/text()').extract()
        voting_text_sub = voting_text_sub[0] if voting_text_sub else ''

        voting_instance, voting_created = Voting.objects.get_or_create(
            session=session_instance, number=voting_number)
        voting_instance.title = voting_title
        voting_instance.record_text = voting_text
        voting_instance.subgroup_title = voting_title_sub
        voting_instance.subgroup_text = voting_text_sub
        # voting session counters
        counts = x.select('//Resultado/Totales')
        counts_assent = counts.select('//Asentimiento/text()').extract()[0]
        counts_assent = counts_assent.lower() != 'no'
        if counts_assent is False:
            counts_presents = counts.select('//Presentes/text()').extract()[0]
            counts_for = counts.select('//AFavor/text()').extract()[0]
            counts_against = counts.select('//EnContra/text()').extract()[0]
            counts_abstentions = counts.select(
                '//Abstenciones/text()').extract()[0]
            counts_dont = counts.select('//NoVotan/text()').extract()[0]

            voting_instance.attendee = counts_presents
            voting_instance.for_votes = counts_for
            voting_instance.against_votes = counts_against
            voting_instance.abstains = counts_abstentions
            voting_instance.no_votes = counts_dont

        voting_instance.assent = counts_assent

        record = response.meta['record']
        initiatives = Initiative.objects.filter(record__exact=record)
        if initiatives:
            voting_instance.initiative_set.add(initiatives.latest('id'))

        voting_instance.save()

        if counts_assent is False:
            # time to parse votes!
            votes = x.select('//Resultado/Votaciones/Votacion')
            Vote.objects.filter(voting=voting_instance).delete()
            votes_list = []
            for v in votes:
                member_seat = v.select('Asiento/text()').extract()[0]
                # @jneight: I don't like searching for members by name; matching by seat would be better?
                full_name = v.select('Diputado/text()').extract()[0]
                second_name, first_name = full_name.split(',', 1)
                vote_type = v.select('Voto/text()').extract()[0]
                member_pk = Member.objects.filter(
                    name__iexact=first_name.strip(),
                    second_name__iexact=second_name.strip()).values_list(
                        'pk', flat=True)
                if member_pk:
                    votes_list.append(
                        Vote(voting=voting_instance,
                             member_id=member_pk[0],
                             vote=vote_type))
            Vote.objects.bulk_create(votes_list)

        return voting_instance
Beispiel #54
0
from scrapy.selector import XmlXPathSelector

xml = (
    """
    <root>
        <foos>
            <foo>the quick <bar>brown </bar>fox</foo>
        </foos>
    </root>
    """
)


xxs = XmlXPathSelector(text=xml)
foos = xxs.select('//foos')
for one in foos:
    text = one.select('./foo//text()').extract()
    text = ''.join(text)
    print(text)
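# With the sample document above this prints: the quick brown fox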

xml = (
    """
    <content type="text/xml">
      <s:dict>
        <s:key name="group_id">MAC</s:key>
        <s:key name="label">NOT FOR RESALE</s:key>
        <s:key name="max_violations">5</s:key>
        <s:key name="quota">1000000000</s:key>
        <s:key name="relative_expiration_interval">0</s:key>
        <s:key name="relative_expiration_start">0</s:key>
Beispiel #55
0
    def parse_google_geocode(self, response):

        self.log('Parsing response from google geocoder\n%s' % (response.body),
                 log.DEBUG)

        xxs = XmlXPathSelector(response=response)
        reportnum = response.request.meta['reportnum']
        source = response.request.meta['source']
        state = response.request.meta['state']
        geocode_cache_key = response.request.meta.get('cache_key', None)

        status = xxs.select('//status/text()').extract()
        if status: status = status[0]
        if status == u'OK':
            result_type = xxs.select('//result/type[1]/text()').extract()
            if result_type: result_type = result_type[0]

            location = xxs.select('//geometry/location')
            lat = location.select('lat/text()').extract()[0]
            lng = location.select('lng/text()').extract()[0]

            geocode_state = xxs.select(
                '//address_component[type="administrative_area_level_1"]/short_name/text()'
            )
            if geocode_state:
                geocode_state = geocode_state.extract()[0]
            else:
                geocode_state = xxs.select(
                    '//address_component[type="country"]/short_name/text()')
                if geocode_state:
                    geocode_state = geocode_state.extract()[0]
                    if geocode_state not in self.us_territories:
                        geocode_state = None

            if source == 'ADDRESS':
                if result_type:
                    source = result_type
                else:
                    source = 'IGNORE'

            if source == 'ZIP' and result_type != 'postal_code':
                self.log('Bad zip code %s' % (geocode_cache_key), log.WARNING)

                source = 'IGNORE'

            if geocode_state:
                if (geocode_state.lower() != state.lower()):
                    self.log(
                        'Geocode state mismatch: expected %s, actual %s' %
                        (state, geocode_state), log.WARNING)
                    source = 'IGNORE'
            else:
                self.log('Geocode returned with no state code', log.WARNING)
                source = 'IGNORE'

            try:
                item = self.createGeocode(reportnum, source, lat, lng)
            except Exception as e:
                self.log(
                    'GeocodeError:%s\n\torig source %s, source %s, loc %s, %s'
                    % (e, response.request.meta['source'], source, lat, lng),
                    log.ERROR)
                raise
            if item:
                if geocode_cache_key:
                    self.db.putGeocodeCache(geocode_cache_key, lat, lng)
                yield item
                self.item_completed(reportnum)
            else:
                self.log(
                    'Dropping geocoder response with result type: %s' %
                    (result_type), log.INFO)

        elif status == 'OVER_QUERY_LIMIT':
            self.log(
                'Geocode failed for task id %s \n%s\n%s' %
                (reportnum, response.request, response.body), log.WARNING)

            # Do not mark the task as done; we will pick it up again on the next
            # run (reportnum is the task identifier carried in the request meta).
            self.item_processing(reportnum)
        else:
            msg = 'Google Geocode operation failed for task id %s : %s \n%s' % (
                reportnum, response.request, response.body)
            try:
                self.send_alert(msg, reportnum)
            except Exception:
                self.log(msg, log.ERROR)
                raise
Beispiel #56
0
    def parse_xml_document(self, response):
        xxs = XmlXPathSelector(response)
        votes = xxs.select('//meeting/vote')
        items = []

        for vote in votes:
            councilvote = VoteItem()
            votenum = int(vote.select('@number').extract()[0])
            councilvote["number"] = votenum
            councilvote["date"] = vote.select('vote-date/text()').extract()[0]
            councilvote["time"] = vote.select('vote-time/text()').extract()[0]
            councilvote["motion_ch"] = vote.select(
                'motion-ch/text()').extract()[0]
            councilvote["motion_en"] = vote.select(
                'motion-en/text()').extract()[0]
            councilvote["mover_ch"] = vote.select(
                'mover-ch/text()').extract()[0]
            councilvote["mover_en"] = vote.select(
                'mover-en/text()').extract()[0]
            councilvote["mover_type"] = vote.select(
                'mover-type/text()').extract()[0]
            councilvote["separate_mechanism"] = vote.select(
                'vote-separate-mechanism/text()').extract()[0]
            if councilvote["separate_mechanism"] == 'Yes':
                mechanism = [
                    'functional-constituency', 'geographical-constituency'
                ]
            else:
                mechanism = ['overall']
            for constituency in mechanism:
                if constituency == 'functional-constituency':
                    short = 'fc_'
                elif constituency == 'geographical-constituency':
                    short = 'gc_'
                else:
                    short = ''
                for count_type in ['present', 'vote', 'yes', 'no', 'abstain']:
                    councilvote[short + count_type] = int(
                        vote.select('vote-summary/' + constituency + '/' +
                                    count_type + '-count/text()').extract()[0])
                councilvote[short + 'result'] = vote.select(
                    'vote-summary/' + constituency + '/' +
                    'result/text()').extract()[0]
            councilvote['result'] = vote.select(
                'vote-summary/overall/result/text()').extract()[0]

            items.append(councilvote)

            members = xxs.select('//meeting/vote[%s]/individual-votes/member' %
                                 votenum)
            for member in members:
                individualvote = IndividualVoteItem()
                individualvote['number'] = councilvote["number"]
                individualvote['date'] = councilvote["date"]
                individualvote['name_ch'] = member.select(
                    '@name-ch').extract()[0]
                individualvote['name_en'] = member.select(
                    '@name-en').extract()[0]
                individualvote['constituency'] = member.select(
                    '@constituency').extract()[0]
                individualvote['vote'] = member.select(
                    'vote/text()').extract()[0]

                items.append(individualvote)

        return items
Beispiel #57
0
    def parse(self, response):
        x = XmlXPathSelector(response)
        x.remove_namespaces()
        x.register_namespace("rdf",
                             "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        items = x.select('//record/metadata/RDF')

        jsons = []

        for item in items:
            creator = item.select(
                'MetaResource/creator/Agent/name/text()').extract()
            title = item.select('Resource/title/text()').extract()
            uri = item.select('Resource/screen/Image/@rdf:about').extract()
            tags = item.select(
                'Resource/subject/Description/value/text()').extract()
            thumbnail = item.select(
                'Resource/thumbnail/Image/@rdf:about').extract()
            lat = item.select(
                'Resource/spatial/Description/lat/text()').extract()
            long = item.select(
                'Resource/spatial/Description/long/text()').extract()
            locality = item.select(
                'Resource/spatial/Description/locality/text()').extract()

            tags_string = '"' + '", "'.join(tags) + '"'

            if not lat:
                newlat = 'null'
            else:
                newlat = lat[0]

            if not long:
                newlong = 'null'
            else:
                newlong = long[0]

            if not locality:
                newloc = ''
            else:
                newloc = locality[0]

            json_entry = '{"title": "' + title[0] + '", "uri": "' + uri[
                0] + '", "attribution_uri": "' + uri[
                    0] + '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail[
                            0] + '", "media_geo_latitude": ' + newlat + ', "media_geo_longitude": ' + newlong + ', "location": "' + newloc + '", "tags": [' + tags_string + '], "archive":"Yahoo! Japan", "media_type": "Image", "layer_type": "Image", "child_items_count":0, "published":1}, '

            jsons.append(json_entry)

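        # OAI-PMH pagination: keep following the resumptionToken until the
        # repository reports no more records.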
        resumptionToken = x.select('//resumptionToken/text()').extract()
        if resumptionToken == []:
            nextFileLink = ''
            open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        else:
            nextFileLink = "http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + resumptionToken[
                0].encode('ascii')
            open(resumptionToken[0].encode('ascii') + '.txt',
                 'wb').write(''.join(jsons).encode("UTF-8"))
        if nextFileLink:
            yield Request(nextFileLink, callback=self.parse)
Beispiel #58
0
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        links = xxs.select("//link/text()").extract()

        return [Request(x, callback=self.parse_item) for x in links]
Beispiel #59
0
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     for title in xxs.select("//item/title/text()").extract():
         log.msg(title)
Beispiel #60
0
 def _extract_links(self, response):
     xxs = XmlXPathSelector(response)
     for url in xxs.select(self.xpath).extract():
         yield Link(url.encode(response.encoding))