def get_current(self, hxs, chart):
    """Populate the chart's current-page metadata from the active page marker.

    Reads two labels from `current_page_name_xpath` (extra, name) and derives
    display_name and id from them. If extraction fails (typically IndexError
    when fewer than two nodes match) and the chart comes from a coming-soon
    page, static "Coming Soon"/"By Date" labels are used instead; otherwise
    the chart is left untouched.
    """
    try:
        active = hxs.select(self.current_page_name_xpath).extract()
        chart["extra"] = active[0].strip()
        chart["name"] = active[1].strip()
        chart["display_name"] = chart["name"]
        chart["id"] = slugify(chart["name"] + chart["extra"])
    # FIX: was the Py3-incompatible `except Exception, e` with an unused
    # binding; behavior is unchanged.
    except Exception:
        if "coming-soon" in chart["origin"]:
            chart["extra"] = "Coming Soon"
            chart["name"] = "By Date"
            chart["display_name"] = chart["name"]
            chart["id"] = slugify(chart["name"] + chart["extra"])
def get_current(self, hxs, chart):
    """Resolve the currently-active page labels into chart metadata,
    falling back to static labels for coming-soon pages."""
    def _apply(extra, name):
        # shared assignment of the four derived fields
        chart["extra"] = extra
        chart["name"] = name
        chart["display_name"] = name
        chart["id"] = slugify(name + extra)
    try:
        labels = hxs.select(self.current_page_name_xpath).extract()
        chart["extra"] = labels[0].strip()
        _apply(chart["extra"], labels[1].strip())
    except Exception:
        if "coming-soon" in chart["origin"]:
            _apply("Coming Soon", "By Date")
def parseUrl(self, type, region):
    """POST a getTopCharts request for the given type/region and store
    the resulting ranked list."""
    payload = urllib.urlencode({
        'method': 'getTopCharts',
        'type': type,
        '_region': region,
    })
    response, contents = self.client.request(self.baseUrl, 'POST', payload)
    if response['status'] != '200':
        print("Error " + response['status'])
        return
    self.setChartOrigin(self.baseUrl)
    self.setChartType(type)
    self.setChartId(slugify("%s %s %s" % (self.chart_name, type, region)))
    self.setChartGeo(region)
    parsed = self.getJsonFromResponse(contents)
    entries = []
    for position, row in enumerate(parsed['result']):
        entry = {}
        # Artist charts carry the artist under "name"; other charts have a
        # separate "artist" plus the typed item name.
        if type == "Artist":
            entry["artist"] = row.pop("name")
        else:
            entry["artist"] = row.pop("artist")
            entry[type.lower()] = row.pop("name")
        entry["rank"] = position
        entries.append(entry)
    self.storeChartItem(entries)
def __createChartItem(self):
    # Build a ChartItem from the scraper state accumulated on self.
    # Raises AttributeError (after logging) when a required attribute was
    # never set by the concrete scraper subclass.
    try:
        chart = ChartItem(
            id = slugify(self.chart_id),
            name = self.chart_name,
            display_name = self.display_name,
            origin = self.origin,
            type = self.chart_type,
            default = self.default,
            source = self.source_id,
            date = self.cacheControl.get("Date-Modified"),
            expires = self.cacheControl.get("Date-Expires"),
            maxage = self.cacheControl.get("Max-Age"),
            list = self.chart_list
        )
    except AttributeError:
        print "ChartItem is missing required attributes!"
        raise
    # Optional metadata is only attached when the scraper opted in via
    # have_extra; each field is skipped when None.
    if self.have_extra :
        if self.geo is not None:
            chart['geo'] = self.geo
        if self.genre is not None:
            chart['genre'] = self.genre
        if self.extra is not None:
            chart['extra'] = self.extra
    return chart
def __createChartItem(self): try: chart = ChartItem(id=slugify(self.chart_id), name=self.chart_name, display_name=self.display_name, origin=self.origin, type=self.chart_type, default=self.default, source=self.source_id, date=self.cacheControl.get("Date-Modified"), expires=self.cacheControl.get("Date-Expires"), maxage=self.cacheControl.get("Max-Age"), list=self.chart_list) except AttributeError: print "ChartItem is missing required attributes!" raise if self.have_extra: if self.geo is not None: chart['geo'] = self.geo if self.genre is not None: chart['genre'] = self.genre if self.extra is not None: chart['extra'] = self.extra return chart
def parse_chart(self, response):
    """Parse a Billboard chart page: build the ChartItem container, infer
    its content type from the name, and kick off pagination requests."""
    hxs = HtmlXPathSelector(response)
    chart_name = hxs.select('//h1[@id="page-title"]/text()').extract()[0].strip()
    #chart_type = hxs.select('//*[@id="chart-list"]/div[@id="chart-type-fb"]/text()').extract()[0].strip()

    # get a list of pages; drop javascript pseudo-links and consume as a queue
    next_pages = hxs.select(self.next_page_xpath).extract()
    next_pages = deque([p for p in next_pages if 'javascript' not in p])

    # Correct the grammar to fit our expectations
    if chart_name == 'Germany Songs':
        chart_name = 'German Tracks'

    chart = ChartItem()
    chart['name'] = chart_name
    chart['display_name'] = chart_name if chart_name else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'billboard'
    chart['id'] = slugify(chart_name)
    chart['list'] = []
    chart['date'] = self.cacheControl.get("Date-Modified")
    chart['expires'] = self.cacheControl.get("Date-Expires")
    chart['maxage'] = self.cacheControl.get("Max-Age")

    # Figure out the content type from keywords in the chart name.
    # BUGFIX: the `any()` tests were inverted (`lower_name in s`), which only
    # matched when the whole name equalled the keyword; the keyword should be
    # searched inside the name, as the 'songs'/'albums' checks already do.
    lower_name = chart_name.lower()
    if 'songs' in lower_name:
        chart['type'] = 'Track'
        typeItem = SingleTrackItem()
    elif ('albums' in lower_name
          or any(s in lower_name for s in ['soundtracks', 'billboard 200', 'tastemakers'])):
        chart['type'] = 'Album'
        typeItem = SingleAlbumItem()
    elif any(s in lower_name for s in ['social 50', 'uncharted']):
        chart['type'] = 'Artist'
        typeItem = SingleArtistItem()
    else:
        chart['type'] = 'Track'
        typeItem = SingleTrackItem()

    if (chart['id'] == settings["BILLBOARD_DEFAULT_ALBUMCHART"]
            or chart['id'] == settings["BILLBOARD_DEFAULT_TRACKCHART"]):
        chart['default'] = 1

    chart = self.parse_items(hxs, chart, typeItem)

    # ok, we've prepped the chart container, lets start getting the pages
    if len(next_pages) > 0:
        next_page = next_pages.popleft()
        request = Request('http://www.billboard.com' + next_page,
                          callback=lambda r: self.parse_page(r, chart, next_pages, typeItem))
        yield request
def parse_chart(self, response):
    """Parse a Billboard chart page and yield the first pagination request.

    The chart's type (Track/Album/Artist) is inferred from keywords in the
    chart name; the remaining pages are handed to parse_page via a deque.
    """
    hxs = HtmlXPathSelector(response)
    chart_name = hxs.select('//h1[@id="page-title"]/text()').extract()[0].strip()
    #chart_type = hxs.select('//*[@id="chart-list"]/div[@id="chart-type-fb"]/text()').extract()[0].strip()

    # get a list of pages
    next_pages = hxs.select(self.next_page_xpath).extract()
    # remove javascript links and turn it into a queue, also, we want to exclude next chart (!)
    next_pages = deque(filter(lambda e: not 'javascript' in e, next_pages))

    # Correct the grammar to fit our expectations
    if chart_name == 'Germany Songs':
        chart_name = 'German Tracks'

    chart = ChartItem()
    chart['name'] = chart_name
    chart['display_name'] = chart_name if chart_name else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'billboard'
    chart['id'] = slugify(chart_name)
    chart['list'] = []
    chart['date'] = self.cacheControl.get("Date-Modified")
    chart['expires'] = self.cacheControl.get("Date-Expires")
    chart['maxage'] = self.cacheControl.get("Max-Age")

    # lets figure out the content type
    # BUGFIX: membership was written as `lower_name in s`, matching only when
    # the whole name equalled a keyword; flipped to search the keyword within
    # the name, consistent with the 'songs'/'albums' tests.
    lower_name = chart_name.lower()
    if 'songs' in lower_name:
        chart['type'] = 'Track'
        typeItem = SingleTrackItem()
    elif 'albums' in lower_name \
            or any(s in lower_name for s in ['soundtracks', 'billboard 200', 'tastemakers']):
        chart['type'] = 'Album'
        typeItem = SingleAlbumItem()
    elif any(s in lower_name for s in ['social 50', 'uncharted']):
        chart['type'] = 'Artist'
        typeItem = SingleArtistItem()
    else:
        chart['type'] = 'Track'
        typeItem = SingleTrackItem()

    if (chart['id'] == settings["BILLBOARD_DEFAULT_ALBUMCHART"]
            or chart['id'] == settings["BILLBOARD_DEFAULT_TRACKCHART"]):
        chart['default'] = 1

    chart = self.parse_items(hxs, chart, typeItem)

    # ok, we've prepped the chart container, lets start getting the pages
    if len(next_pages) > 0:
        next_page = next_pages.popleft()
        request = Request('http://www.billboard.com' + next_page,
                          callback=lambda r: self.parse_page(r, chart, next_pages, typeItem))
        yield request
def parseUrl(self):
    """Fetch the Soundcloud feed and store up to 100 parsed tracks.

    Soundcloud titles embed the artist in free-form ways ("A - T", "A: T",
    an em-dash, ...), so a list of separators is probed in priority order,
    falling back to the uploader's username when none matches.
    """
    print("%s %s" % (self.chart_name, self.url))
    self.setChartId(slugify(self.chart_name))
    self.setChartDisplayName(self.chart_name)
    self.setChartOrigin(self.url)

    # Separators tried in priority order; artist is everything before the
    # first occurrence, track everything after it.
    # BUGFIX: the em-dash was written as the plain-str literal "\u2014"
    # (six literal characters in Python 2), which could never match; it
    # must be a unicode literal.
    separators = (" - ", " -", ": ", ":", u"\u2014")

    chart_list = []
    jsonContent = self.getJsonContent(self.url)
    for rank, items in enumerate(jsonContent):
        # We only take the first 100.
        # BUGFIX: the old counter gate kept iterating and appended empty
        # dicts for every entry past 100; stop instead.
        if len(chart_list) >= 100:
            break
        item = {}
        try:
            # Soundcloud metadata is hard
            item["track"] = items.pop("title").rstrip().strip()
            artist = None
            for sep in separators:
                idx = item["track"].find(sep)
                if idx != -1:
                    artist = item["track"][:idx]
                    item["track"] = item["track"][idx + len(sep):]
                    break
            if artist is None:
                # no separator found: credit the uploader
                artist = items.pop("username").rstrip().strip()
            item["artist"] = artist
            item["rank"] = rank
            item['stream_url'] = ("http://api.soundcloud.com/tracks/"
                                  + str(items.pop("id"))
                                  + "/stream.json?client_id=%s" % (self.apiKey))
        except AttributeError:
            # malformed metadata (non-string title/username): keep whatever
            # fields were already set, matching the old best-effort behavior
            pass
        chart_list.append(item)

    # Stores this chart
    self.storeChartItem(chart_list)
def create_chart(self, response, name=None, type=None):
    # Construct the base Album ChartItem for a response; `newrls` flags
    # new-release / coming-soon pages.
    chart = ChartItem(origin=response.url,
                      source=self.source_id,
                      list=[],
                      date=self.cacheControl.get("Date-Modified"),
                      expires=self.cacheControl.get("Date-Expires"),
                      maxage=self.cacheControl.get("Max-Age"),
                      type="Album",
                      newrls=True if "new-releases" in response.url
                      or "coming-soon" in response.url else False)
    # When both are supplied, name+type also determine the chart id/extra.
    if name is not None and type is not None:
        chart["name"] = name
        chart["display_name"] = name
        chart["id"] = slugify(name + type)
        chart["extra"] = type
    return chart
def create_chart(self, response, name=None, type=None):
    """Build the skeleton Album ChartItem for this response; optional
    name/type pair fills in the identifying fields."""
    is_new_release = ("new-releases" in response.url
                      or "coming-soon" in response.url)
    chart = ChartItem(
        origin=response.url,
        source=self.source_id,
        list=[],
        date=self.cacheControl.get("Date-Modified"),
        expires=self.cacheControl.get("Date-Expires"),
        maxage=self.cacheControl.get("Max-Age"),
        type="Album",
        newrls=is_new_release,
    )
    if name is not None and type is not None:
        chart["name"] = name
        chart["display_name"] = name
        chart["id"] = slugify(name + type)
        chart["extra"] = type
    return chart
def parse(self):
    """Walk every configured section feed and store its track list."""
    for section in self.sections:
        feed_url = '{baseUrl}/r/{section}.json'.format(baseUrl=self.baseUrl,
                                                       section=section)
        payload = self.getJsonContent(feed_url)
        self.setChartOrigin(self.baseUrl)
        self.setChartName(section.capitalize())
        self.setChartDisplayName(self.chart_name)
        self.setChartId(slugify(self.chart_name))
        entries = [{'rank': position,
                    'artist': track['artist'],
                    'track': track['title']}
                   for position, track in enumerate(payload[u'tracks'])]
        self.storeChartItem(entries)
def parse(self):
    # For each configured section, fetch its JSON feed (reddit-style
    # /r/<section>.json path — presumably a subreddit listing; confirm)
    # and store the ranked track list as one chart per section.
    for section in self.sections:
        response = self.getJsonContent('{baseUrl}/r/{section}.json'.format(
            baseUrl=self.baseUrl, section=section))
        self.setChartOrigin(self.baseUrl)
        self.setChartName(section.capitalize())
        self.setChartDisplayName(self.chart_name)
        self.setChartId(slugify(self.chart_name))
        result_list = []
        for rank, item in enumerate(response[u'tracks']):
            chart_item = {
                'rank': rank,
                'artist': item['artist'],
                'track': item['title']
            }
            result_list.append(chart_item)
        self.storeChartItem(result_list)
def parse_chart(self, response):
    """Prep a Billboard chart container and request its first page; the
    remaining pages are drained by parse_page via the shared deque."""
    hxs = HtmlXPathSelector(response)

    # Pages to crawl, minus javascript pseudo-links, consumed as a queue.
    candidates = hxs.select(self.next_page_xpath).extract()
    next_pages = deque(p for p in candidates if 'javascript' not in p)

    chart_name = hxs.select('//*[@class="printable-chart-header"]/h1/b/text()').extract()[0].strip()
    chart_type = hxs.select('//*[@id="chart-list"]/div[@id="chart-type-fb"]/text()').extract()[0].strip()

    chart = ChartItem()
    chart['name'] = chart_name
    chart['origin'] = response.url
    chart['source'] = 'billboard'
    chart['id'] = slugify(chart_name)
    chart['list'] = []

    # Decide Album vs Track: explicit type label wins, then keywords in the
    # chart name, defaulting to Track.
    lower_name = chart_name.lower()
    if chart_type == 'Albums':
        chart['type'] = 'Album'
    elif chart_type == 'Singles':
        chart['type'] = 'Track'
    elif 'albums' in lower_name or 'soundtrack' in lower_name:
        chart['type'] = 'Album'
    else:
        chart['type'] = 'Track'

    # ok, we've prepped the chart container, lets start getting the pages
    first_page = next_pages.popleft()
    yield Request('http://www.billboard.com' + first_page,
                  callback=lambda r: self.parse_page(r, chart, next_pages))
def parseUrl(self, url, extra = None):
    # Fetch a JSON song feed and store it as a chart. The chart name
    # combines the feed type and chart type; `extra` (e.g. a genre tag)
    # refines both the name and the display name.
    self.setChartName("%s %ss" % (self.exfmType.title(), self.chart_type.title()))
    self.setChartDisplayName(extra.title() if extra else self.exfmType.title())
    self.setChartOrigin(url)
    if extra:
        self.setChartName("%s %s" % (self.chart_name, extra))
    self.setChartId(slugify(self.chart_name))
    jsonContent = self.getJsonContent(url)
    chart_list = []
    for rank, items in enumerate(jsonContent['songs']):
        t = {}
        try:
            t["artist"] = items.pop("artist").rstrip().strip()
            t["track"] = items.pop("title").rstrip().strip()
            t["rank"] = rank
        except (AttributeError):
            # missing/non-string fields: the (possibly empty) entry is
            # still appended, preserving positions
            pass
        chart_list.append(t)
    self.storeChartItem(chart_list)
def parseUrl(self, url, extra=None):
    """Fetch a song feed and store it as a chart; `extra` refines the
    chart/display name when given."""
    self.setChartName("%s %ss" % (self.exfmType.title(),
                                  self.chart_type.title()))
    display = extra.title() if extra else self.exfmType.title()
    self.setChartDisplayName(display)
    self.setChartOrigin(url)
    if extra:
        self.setChartName("%s %s" % (self.chart_name, extra))
    self.setChartId(slugify(self.chart_name))

    entries = []
    for position, song in enumerate(self.getJsonContent(url)['songs']):
        entry = {}
        try:
            entry["artist"] = song.pop("artist").rstrip().strip()
            entry["track"] = song.pop("title").rstrip().strip()
            entry["rank"] = position
        except AttributeError:
            # non-string artist/title: keep the partial entry in place
            pass
        entries.append(entry)
    self.storeChartItem(entries)
def parse_albums(self, name, albums, isEditorial):
    """Store a chart of albums sorted by release date.

    Albums with a null release date may still be in range (the API is
    queried per release year), so they are collected separately and
    prepended; both lists are capped at self.maxAlbums.
    """
    if albums is None:
        # something went wrong upstream
        return
    self.setChartName(name)
    self.setChartDisplayName(name)
    self.setChartType("Album")
    # NOTE(review): the branches look inverted — the "editorial" slug is
    # used when isEditorial is False. Preserved as-is because changing it
    # would change published chart ids; confirm against consumers.
    self.setChartId(slugify("%s%s" % (self.source_id, name)
                            if isEditorial is True
                            else "%seditorial %s" % (self.source_id, name)))
    if isEditorial:
        self.setChartExtra("Editorial Choices")
    else:
        self.setChartExtra(None)

    chart_list = []
    nullList = []
    for album in albums:
        try:
            album = album['album']
            title = album['title']
            artist = " ".join([a['name'] for a in album['primaryArtists']])
            try:
                review = album['headlineReview']
                try:
                    # strip [roviLink=...]...[/roviLink] markup, keep the text
                    review['text'] = re.sub(
                        r'((\[roviLink=.+])(.*?)(\[/roviLink]))',
                        r'\3', review['text'])
                except Exception as e:
                    print(e)
            except Exception:
                review = None
            release_date = album['originalReleaseDate']
            rating = album['rating']
            # instead of filter out by releasedate, we search the api by releaseyear
            # the result seems to be more appealing
            # Note: some albums have Null releaseDate, this doesnt necessarily mean
            # that the release date isnt within our range. We include some of them as well
            entry = {'album': title, 'artist': artist, 'date': release_date,
                     'rating': rating, 'review': review}
            if release_date is not None:
                chart_list.append(entry)
            else:
                nullList.append(entry)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; malformed rows are still skipped
            continue

    if len(nullList) > self.maxAlbums:
        print("Slicing NUllList from %s to %s" % (len(nullList), self.maxAlbums))
        nullList = nullList[-self.maxAlbums:]
    chart_list = sorted(chart_list, key=itemgetter('date'))
    if len(chart_list) > self.maxAlbums:
        print("Slicing list from %s to %s" % (len(chart_list), self.maxAlbums))
        chart_list = chart_list[-self.maxAlbums:]
    self.storeChartItem(nullList + chart_list)
def __message(self, ok = False): print "%s %s - %s (%s) : %s" % ((self.__outputMsgOk if ok else self.__outputMsgError), self.source_id, self.chart_type, slugify(self.chart_id), self.display_name)
def parse(self, response):
    """Parse a HotNewHipHop chart page into a ChartItem.

    The URL decides both the chart flavour (upcoming/mainstream/alltime)
    and whether it is a mixtape (Album) or song (Track) chart.
    """
    hxs = HtmlXPathSelector(response)
    chart_name = "Top 100"
    try:
        chart_type = hxs.select('//*[@class="tab-right-active"]/text()').extract()[0].strip()
    except IndexError:
        chart_type = hxs.select('//*[@class="tab-left-active"]/text()').extract()[0].strip()

    # BUGFIX: `extra` was only bound inside the keyword checks, so a URL
    # containing none of them raised NameError below; default to "".
    extra = ""
    if "upcoming" in response.url:
        extra = "Upcoming"
    if "mainstream" in response.url:
        extra = "Mainstream"
    if "alltime" in response.url:
        chart_name += " " + extra
        extra = "Alltime"
    chart_id = chart_name + extra + chart_type  # renamed from `id` (builtin shadow)

    chart = ChartItem()
    chart['name'] = chart_name + " " + chart_type
    chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'hotnewhiphop'
    chart['id'] = slugify(chart_id)
    chart['list'] = []
    chart['extra'] = extra

    # This chart expires daily.
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if "mixtape" in response.url:
        if extra == "Upcoming":
            chart['default'] = 1
        chart['type'] = "Album"
        urlKey = "url"
        url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
    elif "song" in response.url:
        chart['type'] = "Track"
        # Later on, if we have a hnhh resolver, this url could be used to
        # get a valid mp3 stream.
        url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
        urlKey = "stream_url"
    else:
        log.msg("Error with %s" % (chart['name']))
        return

    chart_list = []
    rank = 0
    for item in hxs.select('//div[@class="newCell newCell2"]'):
        # A fresh loader per row, picked by content type. (The old dead
        # pre-loop `loader = ...Item()` assignments were removed — they were
        # always overwritten here.)
        if chart['type'] == "Album":
            loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
        if chart['type'] == "Track":
            loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
        loader.add_xpath(chart['type'].lower(), 'div[@class="centerBlock"]/h3/a/text()')
        loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
        loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
        single = loader.load_item()
        # Rewrite the scraped page link into the ajax api endpoint.
        single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
        rank += 1
        single['rank'] = rank
        chart_list.append(dict(single))
    log.msg("Done with %s" % (chart['name']))
    chart['list'] += chart_list
    return chart
def storeChartItem(self, chart_list): print "Saving chart: %s - %s (%s) : %s" % (self.source_id, self.chart_type, slugify(self.chart_id), self.display_name) self.chart_list = chart_list; chart = self.__createChartItem() self.__updateCache(self.__createMetadata(chart), chart)
def parse(self, response):
    # Parse a djshop.de chart page into an Album ChartItem.
    log.msg("Parsing: %s" % (response.url), loglevel=log.INFO)
    hxs = HtmlXPathSelector(response)
    chart = ChartItem()
    title = hxs.select("//title/text()").extract()[0].strip()
    # Page titles look like "MP3 Downloads [Charts] <name> Charts";
    # group(3) captures the bare chart name.
    test = re.compile('^(MP3 Downloads(\sCharts|\s))(.*?)(\sCharts)',
                      re.IGNORECASE)
    try:
        cTitle = test.match(title).group(3)
        if cTitle is not None:
            type = self.chartTypes[2]["pretty"] + " "
            if "vinyl" in response.url.lower():
                type += self.chartTypes[1]["pretty"]
            else:
                type += self.chartTypes[0]["pretty"]
            chart["extra"] = type
            chart["name"] = cTitle.replace(self.chartTypes[2]["pretty"], "")
    except Exception:
        # Regex did not match (match() returned None): fall back to the
        # known chart-type labels found in the title.
        for type in self.chartTypes:
            if type["unpretty"] in title:
                chart["extra"] = type["pretty"]
                cTitle = title.replace(type["unpretty"], "")
                if len(cTitle) == 0:
                    # No residual name: derive one from the URL slug.
                    chart["name"] = response.url.split('/')[-1].replace(
                        ".html", "").title().replace("-", " ")
                else:
                    chart["name"] = cTitle
                if "Top 100" in chart["extra"]:
                    chart["extra"] += " "
                    if "vinyl" in response.url.lower():
                        chart["extra"] += self.chartTypes[1]["pretty"]
                    else:
                        chart["extra"] += self.chartTypes[0]["pretty"]
                chart["name"] = chart["name"].replace("Charts", "")
        # NOTE(review): if no label matched, chart["extra"]/chart["name"]
        # stay unset and the slugify below raises KeyError — confirm every
        # crawled title carries one of the chartTypes labels.
    if "name" in chart:
        chart["name"] = chart["name"].rstrip("-").strip()
    chart['display_name'] = chart["name"] if chart[
        "name"] else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'djshop.de'
    chart['id'] = slugify(chart["extra"] + chart["name"])
    chart["type"] = "Album"
    chart['date'] = self.cacheControl.get("Date-Modified")
    chart['expires'] = self.cacheControl.get("Date-Expires")
    chart['maxage'] = self.cacheControl.get("Max-Age")
    chart['list'] = []
    ''' This could be transformed into a track chart However, theres so many various and compilations and I dont think Tomahawk would parse them good. 
    Also, its actually a Vinyl chart, so theres no "track" ranking involved '''
    typeItem = SingleAlbumItem()
    cols = hxs.select('//div[@class="column1"]')
    chart_list = []
    for index, col in enumerate(cols):
        loader = XPathItemLoader(typeItem, selector=col)
        # rank is 1-based
        loader.add_xpath('rank', str(index + 1))
        loader.add_xpath('artist', "h2/a/text()")
        loader.add_xpath('album', "h3/text()")
        single = loader.load_item()
        chart_list.append(dict(single))
    chart['list'] += chart_list
    yield chart
def __message(self, ok=False):
    # Emit a one-line status message (success or error prefix) describing
    # the chart that was just processed.
    print "%s %s - %s (%s) : %s" % (
        (self.__outputMsgOk if ok else self.__outputMsgError),
        self.source_id, self.chart_type, slugify(
            self.chart_id), self.display_name)
def parse(self, response):
    # Parse a HotNewHipHop chart page: the URL encodes the chart flavour
    # (upcoming/mainstream/alltime) and its content type (mixtape/song).
    hxs = HtmlXPathSelector(response)
    chart_name = "Top 100"
    try:
        chart_type = hxs.select(
            '//*[@class="tab-right-active"]/text()').extract()[0].strip()
    except IndexError:
        chart_type = hxs.select(
            '//*[@class="tab-left-active"]/text()').extract()[0].strip()
    # NOTE(review): `extra` is only bound inside these checks; a URL with
    # none of the keywords would raise NameError below, and "alltime"
    # assumes upcoming/mainstream already matched — confirm the spider
    # only feeds conforming URLs.
    if "upcoming" in response.url:
        extra = "Upcoming"
    if "mainstream" in response.url:
        extra = "Mainstream"
    if "alltime" in response.url:
        chart_name += " " + extra
        extra = "Alltime"
    id = chart_name + extra + chart_type
    chart = ChartItem()
    chart['name'] = chart_name + " " + chart_type
    chart[
        'display_name'] = chart["name"] if chart["name"] else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'hotnewhiphop'
    chart['id'] = slugify(id)
    chart['list'] = []
    chart['extra'] = extra
    # Cache headers: this chart expires daily.
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")
    if "mixtape" in response.url:
        if extra == "Upcoming":
            chart['default'] = 1
        chart['type'] = "Album"
        loader = SingleUrlAlbumItem()
        urlKey = "url"
        url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
    elif "song" in response.url:
        chart['type'] = "Track"
        loader = SingleUrlTrackItem()
        # Later on, if we have a hnhh resolver, this url could be used to
        # get a valid mp3 stream. 
        url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
        urlKey = "stream_url"
    else:
        log.msg("Error with %s" % (chart['name']))
        return
    chart_list = []
    rank = 0
    for item in hxs.select('//div[@class="newCell newCell2"]'):
        # A fresh loader is created per row; the pre-loop `loader`
        # assignments above are dead stores, always overwritten here.
        if chart['type'] == "Album":
            loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
        if chart['type'] == "Track":
            loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
        loader.add_xpath(chart['type'].lower(),
                         'div[@class="centerBlock"]/h3/a/text()')
        loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
        loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
        single = loader.load_item()
        # Rewrite the scraped page link into the ajax api endpoint.
        single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
        rank += 1
        single['rank'] = rank
        chart_list.append(dict(single))
    log.msg("Done with %s" % (chart['name']))
    chart['list'] += chart_list
    return chart
def parse_albums(self, name, albums, isEditorial):
    # Store a chart of albums sorted by release date. Albums with a Null
    # release date may still be in range (the API is queried per release
    # year), so they are collected separately and prepended; both lists
    # are capped at self.maxAlbums.
    if albums is None:
        # something went wrong
        return
    self.setChartName(name)
    self.setChartDisplayName(name)
    self.setChartType("Album")
    # NOTE(review): the branches look inverted — the "editorial" slug is
    # used when isEditorial is False; confirm against published chart ids.
    self.setChartId(
        slugify("%s%s" % (self.source_id, name) if isEditorial is True else
                "%seditorial %s" % (self.source_id, name)))
    self.setChartExtra(
        "Editorial Choices") if isEditorial else self.setChartExtra(None)
    chart_list = []
    nullList = []
    for album in albums:
        try:
            album = album['album']
            title = album['title']
            artist = " ".join(
                [artist['name'] for artist in album['primaryArtists']])
            try:
                review = album['headlineReview']
                try:
                    # strip [roviLink=...]...[/roviLink] markup, keep text
                    review['text'] = re.sub(
                        r'((\[roviLink=.+])(.*?)(\[/roviLink]))', r'\3',
                        review['text'])
                except Exception, e:
                    print e
            except Exception:
                review = None
            release_date = album['originalReleaseDate']
            rating = album['rating']
            # instead of filter out by releasedate, we search the api by releaseyear
            # the result seems to be more appealing
            # Note: some albums have Null releaseDate, this doesnt necessarily mean
            # that the release date isnt within our range. We include some of them as well
            if release_date is not None:
                chart_list.append({
                    'album': title,
                    'artist': artist,
                    'date': release_date,
                    'rating': rating,
                    'review': review
                })
            else:
                nullList.append({
                    'album': title,
                    'artist': artist,
                    'date': release_date,
                    'rating': rating,
                    'review': review
                })
        # NOTE(review): bare except also swallows SystemExit/
        # KeyboardInterrupt; consider narrowing to Exception.
        except:
            continue
    if (len(nullList) > self.maxAlbums):
        print("Slicing NUllList from %s to %s" %
              (len(nullList), self.maxAlbums))
        nullList = nullList[-self.maxAlbums:]
    chart_list = sorted(chart_list, key=itemgetter('date'))
    if (len(chart_list) > self.maxAlbums):
        print("Slicing list from %s to %s" %
              (len(chart_list), self.maxAlbums))
        chart_list = chart_list[-self.maxAlbums:]
    _list = nullList + chart_list
    self.storeChartItem(_list)
def parse(self, response):
    # Parse a djshop.de chart page into an Album ChartItem.
    log.msg("Parsing: %s" % (response.url), loglevel=log.INFO)
    hxs = HtmlXPathSelector(response)
    chart = ChartItem()
    title = hxs.select("//title/text()").extract()[0].strip()
    # Page titles look like "MP3 Downloads [Charts] <name> Charts";
    # group(3) captures the bare chart name.
    test = re.compile('^(MP3 Downloads(\sCharts|\s))(.*?)(\sCharts)', re.IGNORECASE)
    try:
        cTitle = test.match(title).group(3)
        if cTitle is not None:
            type = self.chartTypes[2]["pretty"] + " "
            if "vinyl" in response.url.lower():
                type += self.chartTypes[1]["pretty"]
            else:
                type += self.chartTypes[0]["pretty"]
            chart["extra"] = type
            chart["name"] = cTitle.replace(self.chartTypes[2]["pretty"], "")
    except Exception:
        # Regex did not match (match() returned None): fall back to the
        # known chart-type labels found in the title.
        for type in self.chartTypes:
            if type["unpretty"] in title:
                chart["extra"] = type["pretty"]
                cTitle = title.replace(type["unpretty"], "")
                if len(cTitle) == 0:
                    # No residual name: derive one from the URL slug.
                    chart["name"] = response.url.split('/')[-1].replace(".html", "").title().replace("-", " ")
                else:
                    chart["name"] = cTitle
                if "Top 100" in chart["extra"]:
                    chart["extra"] += " "
                    if "vinyl" in response.url.lower():
                        chart["extra"] += self.chartTypes[1]["pretty"]
                    else:
                        chart["extra"] += self.chartTypes[0]["pretty"]
                chart["name"] = chart["name"].replace("Charts", "")
        # NOTE(review): if no label matched, chart["extra"]/chart["name"]
        # stay unset and the slugify below raises KeyError — confirm every
        # crawled title carries one of the chartTypes labels.
    if "name" in chart:
        chart["name"] = chart["name"].rstrip("-").strip()
    chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'djshop.de'
    chart['id'] = slugify(chart["extra"] + chart["name"])
    chart["type"] = "Album"
    chart['date'] = self.cacheControl.get("Date-Modified")
    chart['expires'] = self.cacheControl.get("Date-Expires")
    chart['maxage'] = self.cacheControl.get("Max-Age")
    chart['list'] = []
    ''' This could be transformed into a track chart However, theres so many various and compilations and I dont think Tomahawk would parse them good. 
    Also, its actually a Vinyl chart, so theres no "track" ranking involved '''
    typeItem = SingleAlbumItem()
    cols = hxs.select('//div[@class="column1"]')
    chart_list = []
    for index, col in enumerate(cols):
        loader = XPathItemLoader(typeItem, selector=col)
        # rank is 1-based
        loader.add_xpath('rank', str(index + 1))
        loader.add_xpath('artist', "h2/a/text()")
        loader.add_xpath('album', "h3/text()")
        single = loader.load_item()
        chart_list.append(dict(single))
    chart['list'] += chart_list
    yield chart