class DjShopSpider(CrawlSpider):
    """Spider for djshop.de charts (digital, vinyl, top-100, international)."""

    name = "djshop.de"
    allowed_domains = ["djshop.de"]
    baseUrl = "http://www.djshop.de/"
    # Entry pages whose left-hand navigation lists the individual charts.
    baseCharts = [
        "%sDownload-Charts/ex/s~mp3,u~charts/xe/Download-Charts.html" % baseUrl,
        "%sVinyl-Charts/ex/s~charts/xe/charts.html" % baseUrl
    ]
    # Mapping from scraped ("unpretty") chart titles to display names.
    chartTypes = [
        {"unpretty": "MP3 Downloads Charts", "pretty": "Digital Charts"},
        {"unpretty": "Charts Style Charts", "pretty": "Vinyl Charts"},
        {"unpretty": "Charts Top 100", "pretty": "Top 100"},
        {"unpretty": "Charts International Charts", "pretty": "International Charts"},
    ]
    # Expires in 2 days
    expires = chartCache.timedeltaUntilDays(2)
    cacheControl = chartCache.setCacheControl(expires)
    source_id = "djshop.de"
    source_name = "djShop.de"
    description = "Updated daily with what's currently hot on the electronic scene."
    have_extra = True
    details = DetailItem(Detail(
        id=source_id,
        description=description,
        name=source_name,
        have_extra=have_extra
    ))

    def __init__(self, name=None, **kwargs):
        super(DjShopSpider, self).__init__()
        chartCache.shoveDetails(self.details)
        self.get_chart_urls()

    def get_chart_urls(self):
        """Populate start_urls with every chart page linked from the left
        navigation of each base chart page, skipping "Label Charts" links."""
        for chart in self.baseCharts:
            req = urllib2.Request(chart)
            hxs = HtmlXPathSelector(text=urllib2.urlopen(req).read())
            try:
                navBox = hxs.select('//div[@id="leftColumn"]')
                # './/ul...' keeps the search scoped to navBox; the original
                # '//ul...' searched the whole document, making navBox dead.
                navList = navBox.select('.//ul[@class="navUL"]/li')
                for link in navList:
                    if "Label Charts" in link.select('a/text()').extract()[0].strip():
                        continue
                    href = link.select('a/@href').extract()[0].strip()
                    self.start_urls.append("http://www.djshop.de" + href)
            except Exception as e:
                # Best-effort scrape: log the failure, keep going with the
                # remaining base charts.
                print(e)
def setExpiresInDays(self, day, hour=1):
    """Reset the expiry deadline to `day` days from now (at `hour`),
    then rebuild the cache-control headers to match."""
    deadline = chartCache.timedeltaUntilDays(day, hour)
    self.expires = deadline
    self.__getCacheControl()
def setExpiresInDays(self, day, hour=1):
    """Recompute self.expires for a deadline `day` days away at `hour`
    and refresh the derived cache-control state."""
    self.expires = chartCache.timedeltaUntilDays(day, hour)
    self.__getCacheControl()
def parse(self, response):
    """Parse a hotnewhiphop chart page into a ChartItem.

    Derives the chart type from the active tab, qualifies it from the URL
    (upcoming / mainstream / alltime), then scrapes the ranked entries.
    Returns the populated ChartItem, or None when the URL is neither a
    mixtape nor a song chart.
    """
    hxs = HtmlXPathSelector(response)
    chart_name = "Top 100"
    try:
        chart_type = hxs.select('//*[@class="tab-right-active"]/text()').extract()[0].strip()
    except IndexError:
        chart_type = hxs.select('//*[@class="tab-left-active"]/text()').extract()[0].strip()

    # Default to "" so charts whose URL carries no qualifier still build a
    # valid id (the original raised NameError on such URLs).
    extra = ""
    if "upcoming" in response.url:
        extra = "Upcoming"
    if "mainstream" in response.url:
        extra = "Mainstream"
    if "alltime" in response.url:
        chart_name += " " + extra
        extra = "Alltime"

    # Renamed from `id` to avoid shadowing the builtin.
    chart_id = chart_name + extra + chart_type

    chart = ChartItem()
    chart['name'] = chart_name + " " + chart_type
    chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'hotnewhiphop'
    chart['id'] = slugify(chart_id)
    chart['list'] = []
    chart['extra'] = extra

    # Chart expires in one day.
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if "mixtape" in response.url:
        if extra == "Upcoming":
            chart['default'] = 1
        chart['type'] = "Album"
        urlKey = "url"
        url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
    elif "song" in response.url:
        chart['type'] = "Track"
        # Later on, if we have a hnhh resolver, this url could be used to
        # get a valid mp3 stream.
        url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
        urlKey = "stream_url"
    else:
        log.msg("Error with %s" % (chart['name']))
        return

    chart_list = []
    rank = 0
    for item in hxs.select('//div[@class="newCell newCell2"]'):
        # The bare SingleUrl*Item() pre-assignments in the original were
        # dead code: the loader is always rebuilt here per item.
        if chart['type'] == "Album":
            loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
        if chart['type'] == "Track":
            loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
        loader.add_xpath(chart['type'].lower(), 'div[@class="centerBlock"]/h3/a/text()')
        loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
        loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
        single = loader.load_item()
        # Rewrite the scraped page link into the API endpoint url.
        single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
        rank += 1
        single['rank'] = rank
        chart_list.append(dict(single))

    log.msg("Done with %s" % (chart['name']))
    chart['list'] += chart_list
    return chart
def parse_atom(self, feed):
    """Parse an iTunes Atom feed into a ChartItem.

    Returns None for playlists and for feeds missing an id or content
    type. Geo and genre are extracted from the feed id URL; the id
    itself is replaced by its md5 hex digest.
    """
    ns = {'ns': 'http://www.w3.org/2005/Atom',
          'im': 'http://itunes.apple.com/rss'}
    try:
        _id = feed.xpath('/ns:feed/ns:id', namespaces=ns)[0].text
        _type = feed.xpath('/ns:feed/ns:entry/im:contentType/im:contentType',
                           namespaces=ns)[0].attrib['term']
    except IndexError:
        return
    if _type != "Album" and _type != "Track":
        return  # skip playlists

    entries = feed.xpath('/ns:feed/ns:entry', namespaces=ns)
    chart_list = []
    rank = 0
    for entry in entries:
        title = entry.xpath('im:name', namespaces=ns)[0].text
        artist = entry.xpath('im:artist', namespaces=ns)[0].text
        if _type == "Album":
            album = title
            item = SingleAlbumItem()
        elif _type == "Track":
            # Tracks may lack a collection (album) element entirely.
            album = None
            collectionNames = entry.xpath('im:collection/im:name', namespaces=ns)
            if len(collectionNames) > 0:
                album = collectionNames[0].text
            item = SingleTrackItem()
            item['track'] = title
        rank += 1
        item['artist'] = artist
        item['album'] = album
        item['rank'] = rank
        chart_list.append(dict(item))

    title = feed.xpath('ns:title', namespaces=ns)[0].text

    geo = None
    rGeo = re.search(r"cc=([a-zA-Z]+)", _id)
    if rGeo is not None:
        geo = rGeo.groups()[0]

    genre = None
    rGenre = re.search(r"genre=(\d+)/", _id)
    if rGenre is not None:
        genre = rGenre.groups()[0]
    if genre is not None:
        genre = get_genre(genre)

    origin = _id
    md5 = hashlib.md5()
    md5.update(_id)
    _id = md5.hexdigest()

    if geo is None:
        # NOTE(review): this stores the whole path-split list, not a single
        # country code — looks suspicious, but preserved as-is. Confirm intent.
        geo = origin.split("/")

    chart = ChartItem()
    # Itunes expires tomorrow at 00am
    chart['id'] = _id
    chart['display_name'] = genre if genre else "Top Overall"
    chart['origin'] = origin
    chart['genre'] = genre
    chart['geo'] = geo
    chart['name'] = title
    chart['type'] = _type
    chart['list'] = chart_list
    chart['source'] = 'itunes'

    # maxage is the last item scraped
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if (_id == settings["ITUNES_DEFAULT_ALBUMCHART"]
            or _id == settings["ITUNES_DEFAULT_TRACKCHART"]):
        print("Found default" + _id)
        chart['default'] = 1
    return chart
def parse_rss(self, feed, url):
    """Parse an iTunes new-releases RSS feed into an album ChartItem.

    Returns None when the genre, the feed flavour ("extra") or the geo
    cannot be determined from the url.
    """
    genre_name = None
    feed_extra = None
    feed_type = "Album"
    geo = None
    # Pick the 'genre=...' component out of the url path. A comprehension
    # (not filter()) so indexing works the same on Python 2 and 3.
    genre = [k for k in urlparser(url).path.split("/") if 'genre' in k]
    try:
        genre_name = get_genre(genre[0].split("=")[1])
        # geo in xpath is different ISO than in url. We want cc not xpath
        # geo = feed.xpath('.//channel/language')[0].text
        rGeo = re.search(r"cc=(.*)(?=/)", url)
        if rGeo is not None:
            geo = rGeo.groups()[0]
    except IndexError:
        return

    if 'newreleases' in url:
        feed_extra = "New Album Releases"
    if 'justadded' in url:
        feed_extra = "Just Added Albums"
    if 'featuredalbums' in url:
        feed_extra = "Featured Albums"
    if feed_extra is None or genre_name is None or geo is None:
        return

    ns = {'itms': 'http://phobos.apple.com/rss/1.0/modules/itms/'}
    entries = feed.xpath('.//channel/item')
    rank = 0
    chart_list = []
    for entry in entries:
        artist = entry.xpath('itms:artist', namespaces=ns)[0].text
        album = entry.xpath('itms:album', namespaces=ns)[0].text
        rank += 1
        item = SingleAlbumItem()
        item['artist'] = artist
        item['album'] = album
        item['rank'] = rank
        chart_list.append(dict(item))

    chart = ChartItem()
    # Unique ids
    _id = url
    md5 = hashlib.md5()
    md5.update(_id)
    _id = md5.hexdigest()

    chart['id'] = _id
    chart['origin'] = url
    chart['genre'] = genre_name
    chart['geo'] = geo.lower()
    chart['name'] = genre_name
    chart['extra'] = feed_extra
    chart["newrls"] = True
    chart['type'] = feed_type
    chart['list'] = chart_list
    chart['source'] = 'itunes'

    # maxage is the last item scraped
    # Expires in 1 days
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if _id == settings["ITUNES_DEFAULT_NRCHART"]:
        chart['default'] = 1
    return chart
def parse(self, response):
    """Parse a hotnewhiphop chart page into a ChartItem.

    Reads the active tab for the chart type, qualifies the chart from
    the URL, then collects the ranked entries. Returns None (after
    logging) for URLs that are neither mixtape nor song charts.
    """
    hxs = HtmlXPathSelector(response)
    chart_name = "Top 100"
    try:
        chart_type = hxs.select('//*[@class="tab-right-active"]/text()').extract()[0].strip()
    except IndexError:
        chart_type = hxs.select('//*[@class="tab-left-active"]/text()').extract()[0].strip()

    # Initialise so URLs with no qualifier don't hit a NameError below.
    extra = ""
    if "upcoming" in response.url:
        extra = "Upcoming"
    if "mainstream" in response.url:
        extra = "Mainstream"
    if "alltime" in response.url:
        chart_name += " " + extra
        extra = "Alltime"

    # `chart_id` instead of `id` — don't shadow the builtin.
    chart_id = chart_name + extra + chart_type

    chart = ChartItem()
    chart['name'] = chart_name + " " + chart_type
    chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'hotnewhiphop'
    chart['id'] = slugify(chart_id)
    chart['list'] = []
    chart['extra'] = extra

    # Chart expires in one day.
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if "mixtape" in response.url:
        if extra == "Upcoming":
            chart['default'] = 1
        chart['type'] = "Album"
        urlKey = "url"
        url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
    elif "song" in response.url:
        chart['type'] = "Track"
        # Later on, if we have a hnhh resolver, this url could be used to
        # get a valid mp3 stream.
        url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
        urlKey = "stream_url"
    else:
        log.msg("Error with %s" % (chart['name']))
        return

    chart_list = []
    rank = 0
    for item in hxs.select('//div[@class="newCell newCell2"]'):
        # (Dropped the original's dead SingleUrl*Item() pre-assignments;
        # the loader is always created here, per item.)
        if chart['type'] == "Album":
            loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
        if chart['type'] == "Track":
            loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
        loader.add_xpath(chart['type'].lower(), 'div[@class="centerBlock"]/h3/a/text()')
        loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
        loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
        single = loader.load_item()
        # Turn the scraped page link into the matching API endpoint url.
        single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
        rank += 1
        single['rank'] = rank
        chart_list.append(dict(single))

    log.msg("Done with %s" % (chart['name']))
    chart['list'] += chart_list
    return chart
class MetacriticSpider(CrawlSpider):
    """Spider for metacritic.com album charts (by genre, score and date)."""

    name = "metacritic.com"
    allowed_domains = ["metacritic.com"]
    baseUrl = "http://www.metacritic.com"

    # XPaths for the navigation and list elements on metacritic pages.
    genre_nav_xpath = './/ul[@class="genre_nav"]/li'
    types_xpath = './/ul[contains(@class, "tabs")]/li'
    first_nav_xpath = './/ul[contains(@class, "nav_items")]/li'
    current_page_name_xpath = './/ul[contains(@class, "tabs")]/li/span[@class="active"]/span/text()'
    list_xpath = './/ol[contains(@class,"list_product_condensed")]/li'
    next_page_xpath = './/ul[@class="pages"]/li/a/@href'
    coming_soon_table_xpath = './/table[@class="musicTable"]/tr'
    coming_soon_artist_xpath = './/td[@class="artistName"]'
    coming_soon_album_xpath = './/td[@class="albumTitle"]/text()'

    start_urls = ["http://www.metacritic.com/music"]

    rules = [
        Rule(SgmlLinkExtractor(allow=(r"albums/genre/\w+",),
                               deny=("music", "name",),
                               restrict_xpaths=(genre_nav_xpath,)),
             callback='parse_page', follow=True),
        Rule(SgmlLinkExtractor(deny=(r"albums/genre/\w+", "name", "music",
                                     "coming-soon/(metascore|userscore|name|date)",
                                     "new-releases/name"),
                               restrict_xpaths=(types_xpath,)),
             callback='parse_new_releases', follow=True),
        Rule(SgmlLinkExtractor(allow=("albums/release-date", "albums/score",),
                               deny=("feature", "artist", r"/\w+/people",),
                               restrict_xpaths=(first_nav_xpath,)),
             callback='parse_new_releases', follow=True)
    ]

    # Expires in 1 day (the original comment said 2 days, contradicting
    # the value actually passed below).
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    source_id = "metacritic"
    source_name = "Metacritic"
    description = "Critically acclaimed and noteworthy music."
    have_extra = True
    details = DetailItem(Detail(
        id=source_id,
        description=description,
        name=source_name,
        have_extra=have_extra
    ))

    def __init__(self, name=None, **kwargs):
        super(MetacriticSpider, self).__init__()
        # NOTE(review): details are shoved twice, once with the default flag
        # and once with False — presumably two cache views; confirm intent.
        chartCache.shoveDetails(self.details)
        chartCache.shoveDetails(self.details, False)

    def get_current_genre(self, hxs):
        """Return the active genre label from the genre nav, or None."""
        for item in hxs.select(self.genre_nav_xpath):
            if item.select('.//span'):
                return item.select('.//span/text()').extract()[0].strip()
        return None

    def get_current(self, hxs, chart):
        """Fill chart extra/name/display_name/id from the active page tabs;
        fall back to fixed values for coming-soon pages when the tabs are
        missing or incomplete."""
        try:
            active = hxs.select(self.current_page_name_xpath).extract()
            chart["extra"] = active[0].strip()
            chart["name"] = active[1].strip()
            chart["display_name"] = chart["name"]
            chart["id"] = slugify(chart["name"] + chart["extra"])
        except Exception:
            if "coming-soon" in chart["origin"]:
                chart["extra"] = "Coming Soon"
                chart["name"] = "By Date"
                chart["display_name"] = chart["name"]
                chart["id"] = slugify(chart["name"] + chart["extra"])
def parse_atom(self, feed):
    """Parse an iTunes Atom feed into a ChartItem.

    Returns None for playlists and for feeds missing an id or content
    type. The feed id URL supplies geo and genre; the stored id is its
    md5 hex digest.
    """
    ns = {'ns': 'http://www.w3.org/2005/Atom',
          'im': 'http://itunes.apple.com/rss'}
    try:
        _id = feed.xpath('/ns:feed/ns:id', namespaces=ns)[0].text
        _type = feed.xpath('/ns:feed/ns:entry/im:contentType/im:contentType',
                           namespaces=ns)[0].attrib['term']
    except IndexError:
        return
    if _type != "Album" and _type != "Track":
        return  # skip playlists

    entries = feed.xpath('/ns:feed/ns:entry', namespaces=ns)
    chart_list = []
    rank = 0
    for entry in entries:
        title = entry.xpath('im:name', namespaces=ns)[0].text
        artist = entry.xpath('im:artist', namespaces=ns)[0].text
        if _type == "Album":
            album = title
            item = SingleAlbumItem()
        elif _type == "Track":
            # Guard against tracks with no collection element; the
            # unguarded [0] here raised IndexError on such entries.
            album = None
            collectionNames = entry.xpath('im:collection/im:name', namespaces=ns)
            if len(collectionNames) > 0:
                album = collectionNames[0].text
            item = SingleTrackItem()
            item['track'] = title
        rank += 1
        item['artist'] = artist
        item['album'] = album
        item['rank'] = rank
        chart_list.append(dict(item))

    title = feed.xpath('ns:title', namespaces=ns)[0].text

    geo = None
    rGeo = re.search(r"cc=([a-zA-Z]+)", _id)
    if rGeo is not None:
        geo = rGeo.groups()[0]

    genre = None
    rGenre = re.search(r"genre=(\d+)/", _id)
    if rGenre is not None:
        genre = rGenre.groups()[0]
    if genre is not None:
        genre = get_genre(genre)

    origin = _id
    md5 = hashlib.md5()
    md5.update(_id)
    _id = md5.hexdigest()

    if geo is None:
        # NOTE(review): stores the whole path-split list rather than one
        # country code — preserved as-is; confirm intent.
        geo = origin.split("/")

    chart = ChartItem()
    # Itunes expires tomorrow at 00am
    chart['id'] = _id
    chart['display_name'] = genre if genre else "Top Overall"
    chart['origin'] = origin
    chart['genre'] = genre
    chart['geo'] = geo
    chart['name'] = title
    chart['type'] = _type
    chart['list'] = chart_list
    chart['source'] = 'itunes'

    # maxage is the last item scraped
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if (_id == settings["ITUNES_DEFAULT_ALBUMCHART"]
            or _id == settings["ITUNES_DEFAULT_TRACKCHART"]):
        print("Found default" + _id)
        chart['default'] = 1
    return chart