class DjShopSpider(CrawlSpider):
    name = "djshop.de"
    allowed_domains = ["djshop.de"]
    baseUrl = "http://www.djshop.de/"
    baseCharts = [
        "%sDownload-Charts/ex/s~mp3,u~charts/xe/Download-Charts.html" %
        baseUrl,
        "%sVinyl-Charts/ex/s~charts/xe/charts.html" % baseUrl
    ]

    chartTypes = [{"unpretty" : "MP3 Downloads Charts", "pretty" : "Digital Charts"}, \
                  {"unpretty" : "Charts Style Charts", "pretty" : "Vinyl Charts"}, \
                  {"unpretty" : "Charts Top 100", "pretty" : "Top 100"}, \
                  {"unpretty" : "Charts International Charts", "pretty" : "International Charts"}]

    # Expires in 2 days
    expires = chartCache.timedeltaUntilDays(2)
    cacheControl = chartCache.setCacheControl(expires)

    source_id = "djshop.de"
    source_name = "djShop.de"
    description = "Updated daily with what's currently hot on the electronic scene."
    have_extra = True
    details = DetailItem(
        Detail(id=source_id,
               description=description,
               name=source_name,
               have_extra=have_extra))

    def __init__(self, name=None, **kwargs):
        super(DjShopSpider, self).__init__()
        chartCache.shoveDetails(self.details)
        self.get_chart_urls()

    def get_chart_urls(self):
        for chart in self.baseCharts:
            req = urllib2.Request(chart)
            hxs = HtmlXPathSelector(text=urllib2.urlopen(req).read())
            try:
                navBox = hxs.select('//div[@id="leftColumn"]')
                navList = navBox.select('//ul[@class="navUL"]/li')
                for index, link in enumerate(navList):
                    if not "Label Charts" in link.select(
                            'a/text()').extract()[0].strip():
                        self.start_urls.append(
                            "http://www.djshop.de" +
                            link.select('a/@href').extract()[0].strip())
            except Exception, e:
                print e
 def setExpiresInDays(self, day, hour = 1):
     # Recompute the cache window: expiry `day` days ahead (at `hour`
     # o'clock) and rebuild the cache-control headers from it.
     self.expires = chartCache.timedeltaUntilDays(day, hour)
     self.__getCacheControl()
# Esempio n. 3
# 0
 def setExpiresInDays(self, day, hour=1):
     # Refresh expiry to `day` days from now (at `hour` o'clock) and
     # regenerate the derived cache-control headers.
     self.expires = chartCache.timedeltaUntilDays(day, hour)
     self.__getCacheControl()
# Esempio n. 4
# 0
    def parse(self, response):
        """Build a ChartItem from a hotnewhiphop chart page.

        Mixtape pages yield Album charts, song pages yield Track charts;
        any other URL is logged and skipped (returns None).
        """
        hxs = HtmlXPathSelector(response)
        chart_name = "Top 100"
        # The active tab carries the chart type; it may sit on either side.
        try:
            chart_type = hxs.select(
                '//*[@class="tab-right-active"]/text()').extract()[0].strip()
        except IndexError:
            chart_type = hxs.select(
                '//*[@class="tab-left-active"]/text()').extract()[0].strip()

        # Fix: default to an empty qualifier so URLs matching none of the
        # cases below no longer raise NameError when `extra` is read.
        extra = ""
        if "upcoming" in response.url:
            extra = "Upcoming"
        if "mainstream" in response.url:
            extra = "Mainstream"
        if "alltime" in response.url:
            # "alltime" pages fold the previous qualifier into the name.
            chart_name += " " + extra
            extra = "Alltime"

        # Renamed from `id` to avoid shadowing the builtin.
        chart_id = chart_name + extra + chart_type
        chart = ChartItem()
        chart['name'] = chart_name + " " + chart_type
        chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'hotnewhiphop'
        chart['id'] = slugify(chart_id)
        chart['list'] = []
        chart['extra'] = extra

        # Chart is considered fresh for one day.
        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if "mixtape" in response.url:
            if extra == "Upcoming":
                chart['default'] = 1
            chart['type'] = "Album"
            urlKey = "url"
            url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
        elif "song" in response.url:
            chart['type'] = "Track"
            # Later on, if we have a hnhh resolver, this url could be used to
            # get a valid mp3 stream.
            url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
            urlKey = "stream_url"
        else:
            log.msg("Error with %s" % (chart['name']))
            return

        chart_list = []
        rank = 0
        for item in hxs.select('//div[@class="newCell newCell2"]'):
            # Fresh loader per cell; item class follows the chart type.
            # (The dead pre-loop `loader = SingleUrl*Item()` assignments
            # from the original were removed — they were always overwritten.)
            if chart['type'] == "Album":
                loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
            else:
                loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
            loader.add_xpath(chart['type'].lower(),
                             'div[@class="centerBlock"]/h3/a/text()')
            loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
            loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
            single = loader.load_item()
            # Rewrite the scraped href into an API url keyed by the page id.
            single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
            rank += 1
            single['rank'] = rank
            chart_list.append(dict(single))

        log.msg("Done with %s" % (chart['name']))
        chart['list'] += chart_list
        return chart
# Esempio n. 5
# 0
    def parse_atom(self, feed):
        """Parse an iTunes Atom chart feed into a ChartItem.

        Returns None when the feed id / content type is missing, or when
        the feed is neither an Album nor a Track chart (e.g. playlists).
        """
        ns = {'ns': 'http://www.w3.org/2005/Atom',
            'im': 'http://itunes.apple.com/rss'}
        try:
            _id = feed.xpath('/ns:feed/ns:id', namespaces=ns)[0].text
            _type = feed.xpath('/ns:feed/ns:entry/im:contentType/im:contentType', namespaces=ns)[0].attrib['term']
        except IndexError:
            return

        if _type != "Album" and _type != "Track":
            return # skip playlists

        entries = feed.xpath('/ns:feed/ns:entry', namespaces=ns)
        chart_list = []
        rank = 0
        for entry in entries:
            title = entry.xpath('im:name', namespaces=ns)[0].text
            artist = entry.xpath('im:artist', namespaces=ns)[0].text
            if _type == "Album":
                album = title
                item = SingleAlbumItem()
            elif _type == "Track":
                # Tracks may lack a parent collection; only read the album
                # name when the element is actually present.
                album = None
                collectionNames = entry.xpath('im:collection/im:name', namespaces=ns)
                if len(collectionNames) > 0:
                    album = collectionNames[0].text
                item = SingleTrackItem()
                item['track'] = title
            
            rank += 1
            item['artist'] = artist
            item['album'] = album
            item['rank'] = rank
            chart_list.append( dict(item) )

        title = feed.xpath('ns:title', namespaces=ns)[0].text

        # Country code is embedded in the feed id as "cc=XX".
        geo = None
        geo_re = re.compile("cc=([a-zA-Z]+)")
        rGeo =  geo_re.search(_id)
        if rGeo != None:
            geo = rGeo.groups()[0]

        # Numeric genre id is embedded as "genre=NN/", then resolved to a name.
        genre = None
        genre_re = re.compile("genre=(\d+)/")
        rGenre =  genre_re.search(_id)
        if rGenre != None:
            genre = rGenre.groups()[0]

        if not genre is None:
            genre = get_genre(genre)

        # Keep the raw feed id as origin; the chart id becomes its md5 digest.
        origin = _id
        md5 = hashlib.md5()
        md5.update(_id)
        _id = md5.hexdigest()

        if geo is None:
            # NOTE(review): this assigns the whole path-split list to `geo`,
            # not a single segment — looks like a bug; confirm intended index.
            geo_s = origin.split("/")
            geo = geo_s

        chart = ChartItem()
        # Itunes expires tomorrow at 00am
        chart['id'] = _id
        chart['display_name'] = genre if genre else "Top Overall"
        chart['origin'] = origin
        chart['genre'] = genre
        chart['geo'] = geo
        chart['name'] = title
        chart['type'] = _type
        chart['list'] = chart_list
        chart['source'] = 'itunes'

        # maxage is the last item scraped
        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if(_id == settings["ITUNES_DEFAULT_ALBUMCHART"] or _id == settings["ITUNES_DEFAULT_TRACKCHART"]):
            print "Found default" + _id
            chart['default'] = 1

        return chart
# Esempio n. 6
# 0
 def parse_rss(self, feed, url):
     """Parse an iTunes RSS album feed into a ChartItem.

     Returns None when the feed flavour, genre or country code cannot be
     derived from the URL.
     """
     genre_name = None
     feed_extra = None
     feed_type = "Album"
     geo = None
     # Pick the URL path segment carrying the genre id (e.g. "genre=21").
     genre = filter(lambda k: 'genre' in k, urlparser(url).path.split("/"))
     try :
         genre_name = get_genre( genre[0].split("=")[1] )
         # geo in xpath is different ISO than in url. We want cc not xpath
         # geo = feed.xpath('.//channel/language')[0].text
         geo_re = re.compile("cc=(.*)(?=\/)")
         rGeo =  geo_re.search(url)
         if rGeo != None:
             geo = rGeo.groups()[0]
     except IndexError :
         return
     
     # Later matches win should a URL somehow contain several flavours.
     if 'newreleases' in url :
         feed_extra = "New Album Releases"
     if 'justadded' in url :
         feed_extra = "Just Added Albums"
     if 'featuredalbums' in url:
         feed_extra = "Featured Albums"
     
     if feed_extra is None or genre_name is None or geo is None :
         return
     
     ns = { 'itms': 'http://phobos.apple.com/rss/1.0/modules/itms/' }
     entries = feed.xpath('.//channel/item')
     rank = 0
     chart_list = []
     for entry in entries:
         artist = entry.xpath('itms:artist', namespaces=ns)[0].text
         album = entry.xpath('itms:album', namespaces=ns)[0].text
         rank += 1
         item = SingleAlbumItem()
         item['artist'] = artist
         item['album'] = album
         item['rank'] = rank
         chart_list.append( dict(item) )
     
     chart = ChartItem()
     # Unique ids: md5 digest of the feed URL.
     _id = url
     md5 = hashlib.md5()
     md5.update(_id)
     _id = md5.hexdigest()
     
     chart['id'] = _id
     chart['origin'] = url
     chart['genre'] = genre_name
     chart['geo'] = geo.lower()
     chart['name'] = genre_name
     chart['extra'] = feed_extra
     chart["newrls"] = True
     chart['type'] = feed_type
     chart['list'] = chart_list
     chart['source'] = 'itunes'
     # maxage is the last item scraped
     # Expires in 1 days
     expires = chartCache.timedeltaUntilDays(1)
     cacheControl = chartCache.setCacheControl(expires)
     chart['date'] = cacheControl.get("Date-Modified")
     chart['expires'] = cacheControl.get("Date-Expires")
     chart['maxage'] = cacheControl.get("Max-Age")
     
     if _id == settings["ITUNES_DEFAULT_NRCHART"]:
         chart['default'] = 1
     
     return chart
    def parse(self, response):
        """Build a ChartItem from a hotnewhiphop chart page.

        Mixtape pages become Album charts, song pages become Track charts;
        other URLs are logged and skipped (returns None).
        """
        hxs = HtmlXPathSelector(response)
        chart_name = "Top 100"
        # The active tab carries the chart type; it may sit on either side.
        try:
            chart_type = hxs.select('//*[@class="tab-right-active"]/text()').extract()[0].strip()
        except IndexError:
            chart_type = hxs.select('//*[@class="tab-left-active"]/text()').extract()[0].strip()

        # Fix: default to an empty qualifier so URLs matching none of the
        # cases below no longer raise NameError when `extra` is read.
        extra = ""
        if "upcoming" in response.url:
            extra = "Upcoming"
        if "mainstream" in response.url:
            extra = "Mainstream"
        if "alltime" in response.url:
            # "alltime" pages fold the previous qualifier into the name.
            chart_name += " " + extra
            extra = "Alltime"

        # Renamed from `id` to avoid shadowing the builtin.
        chart_id = chart_name + extra + chart_type
        chart = ChartItem()
        chart['name'] = chart_name + " " + chart_type
        chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'hotnewhiphop'
        chart['id'] = slugify(chart_id)
        chart['list'] = []
        chart['extra'] = extra

        # Chart is considered fresh for one day.
        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if "mixtape" in response.url:
            if extra == "Upcoming":
                chart['default'] = 1
            chart['type'] = "Album"
            urlKey = "url"
            url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
        elif "song" in response.url:
            chart['type'] = "Track"
            # Later on, if we have a hnhh resolver, this url could be used to
            # get a valid mp3 stream.
            url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
            urlKey = "stream_url"
        else:
            log.msg("Error with %s" %(chart['name']))
            return

        chart_list = []
        rank = 0
        for item in hxs.select('//div[@class="newCell newCell2"]'):
            # Fresh loader per cell; item class follows the chart type.
            # (Dead pre-loop `loader = SingleUrl*Item()` assignments removed.)
            if chart['type'] == "Album":
                loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
            else:
                loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
            loader.add_xpath(chart['type'].lower(), 'div[@class="centerBlock"]/h3/a/text()')
            loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
            loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
            single = loader.load_item()
            # Rewrite the scraped href into an API url keyed by the page id.
            single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
            rank += 1
            single['rank'] = rank
            chart_list.append(dict(single))

        log.msg("Done with %s" %(chart['name']))
        chart['list'] += chart_list
        return chart
# Esempio n. 8
# 0
class MetacriticSpider(CrawlSpider):
    # Scrapes metacritic.com music charts: per-genre albums, new releases
    # and coming-soon listings.
    name = "metacritic.com"
    allowed_domains = ["metacritic.com"]
    baseUrl = "http://www.metacritic.com"

    # XPath anchors for navigation, tabs, product lists and paging.
    genre_nav_xpath = './/ul[@class="genre_nav"]/li'
    types_xpath = './/ul[contains(@class, "tabs")]/li'
    first_nav_xpath = './/ul[contains(@class, "nav_items")]/li'
    current_page_name_xpath = './/ul[contains(@class, "tabs")]/li/span[@class="active"]/span/text()'
    list_xpath = './/ol[contains(@class,"list_product_condensed")]/li'
    next_page_xpath = './/ul[@class="pages"]/li/a/@href'
    # Coming-soon pages use a plain table layout instead of product lists.
    coming_soon_table_xpath = './/table[@class="musicTable"]/tr'
    coming_soon_artist_xpath = './/td[@class="artistName"]'
    coming_soon_album_xpath = './/td[@class="albumTitle"]/text()'

    start_urls = ["http://www.metacritic.com/music"]

    # Crawl rules: genre pages, tab pages (new releases etc.) and the
    # score/release-date orderings; feature/artist/people pages excluded.
    rules = [
        Rule(SgmlLinkExtractor(allow=("albums/genre/\w+", ),
                               deny=(
                                   "music",
                                   "name",
                               ),
                               restrict_xpaths=(genre_nav_xpath, )),
             callback='parse_page',
             follow=True),
        Rule(SgmlLinkExtractor(
            deny=("albums/genre/\w+", "name", "music",
                  "coming-soon/(metascore|userscore|name|date)",
                  "new-releases/name"),
            restrict_xpaths=(types_xpath, )),
             callback='parse_new_releases',
             follow=True),
        Rule(SgmlLinkExtractor(allow=(
            "albums/release-date",
            "albums/score",
        ),
                               deny=(
                                   "feature",
                                   "artist",
                                   "/\w+/people",
                               ),
                               restrict_xpaths=(first_nav_xpath, )),
             callback='parse_new_releases',
             follow=True)
    ]

    # Cache window: expires in 1 day.
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    source_id = "metacritic"
    source_name = "Metacritic"
    description = "Critically acclaimed and noteworthy music."
    have_extra = True

    details = DetailItem(
        Detail(id=source_id,
               description=description,
               name=source_name,
               have_extra=have_extra))

    def __init__(self, name=None, **kwargs):
        # Registers source details twice; the second call passes False —
        # presumably a second registry/variant of shoveDetails. TODO confirm.
        super(MetacriticSpider, self).__init__()
        chartCache.shoveDetails(self.details)
        chartCache.shoveDetails(self.details, False)

    def get_current_genre(self, hxs):
        """Return the active genre label from the genre nav, or None.

        The selected entry is the <li> that wraps its label in a <span>;
        its stripped text is returned.
        """
        # Direct iteration: the enumerate() index in the original was unused.
        for item in hxs.select(self.genre_nav_xpath):
            if item.select('.//span'):
                return item.select('.//span/text()').extract()[0].strip()
        return None

    def get_current(self, hxs, chart):
        """Fill chart name/extra/display_name/id from the active tab labels.

        Coming-soon pages use different markup, so on failure they get a
        fixed "By Date" / "Coming Soon" labelling instead.
        """
        try:
            active = hxs.select(self.current_page_name_xpath).extract()
            # presumably active[0] is the qualifier and active[1] the page
            # name — TODO confirm against live markup.
            chart["extra"] = active[0].strip()
            chart["name"] = active[1].strip()
            chart["display_name"] = chart["name"]
            chart["id"] = slugify(chart["name"] + chart["extra"])
        except Exception, e:
            # NOTE(review): `e` is unused and errors on non-coming-soon
            # pages are silently swallowed, leaving `chart` unlabelled —
            # confirm this best-effort behaviour is intended.
            if "coming-soon" in chart["origin"]:
                chart["extra"] = "Coming Soon"
                chart["name"] = "By Date"
                chart["display_name"] = chart["name"]
                chart["id"] = slugify(chart["name"] + chart["extra"])
    def parse_atom(self, feed):
        ns = {'ns': 'http://www.w3.org/2005/Atom',
            'im': 'http://itunes.apple.com/rss'}
        try:
            _id = feed.xpath('/ns:feed/ns:id', namespaces=ns)[0].text
            _type = feed.xpath('/ns:feed/ns:entry/im:contentType/im:contentType', namespaces=ns)[0].attrib['term']
        except IndexError:
            return

        if _type != "Album" and _type != "Track":
            return # skip playlists

        entries = feed.xpath('/ns:feed/ns:entry', namespaces=ns)
        chart_list = []
        rank = 0
        for entry in entries:
            title = entry.xpath('im:name', namespaces=ns)[0].text
            artist = entry.xpath('im:artist', namespaces=ns)[0].text
            if _type == "Album":
                album = title
                item = SingleAlbumItem()
            elif _type == "Track":
                album = entry.xpath('im:collection/im:name', namespaces=ns)[0].text
                item = SingleTrackItem()
                item['track'] = title
            
            rank += 1
            item['artist'] = artist
            item['album'] = album
            item['rank'] = rank
            chart_list.append( dict(item) )

        title = feed.xpath('ns:title', namespaces=ns)[0].text

        geo = None
        geo_re = re.compile("cc=([a-zA-Z]+)")
        rGeo =  geo_re.search(_id)
        if rGeo != None:
            geo = rGeo.groups()[0]

        genre = None
        genre_re = re.compile("genre=(\d+)/")
        rGenre =  genre_re.search(_id)
        if rGenre != None:
            genre = rGenre.groups()[0]

        if not genre is None:
            genre = get_genre(genre)

        origin = _id
        md5 = hashlib.md5()
        md5.update(_id)
        _id = md5.hexdigest()

        if geo is None:
            geo_s = origin.split("/")
            geo = geo_s

        chart = ChartItem()
        # Itunes expires tomorrow at 00am
        chart['id'] = _id
        chart['display_name'] = genre if genre else "Top Overall"
        chart['origin'] = origin
        chart['genre'] = genre
        chart['geo'] = geo
        chart['name'] = title
        chart['type'] = _type
        chart['list'] = chart_list
        chart['source'] = 'itunes'

        # maxage is the last item scraped
        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if(_id == settings["ITUNES_DEFAULT_ALBUMCHART"] or _id == settings["ITUNES_DEFAULT_TRACKCHART"]):
            print "Found default" + _id
            chart['default'] = 1

        return chart
# Esempio n. 10
# 0
 def parse_rss(self, feed, url):
     """Parse an iTunes RSS album feed into a ChartItem.

     Returns None when the feed flavour, genre or country code cannot be
     derived from the URL.
     """
     genre_name = None
     feed_extra = None
     feed_type = "Album"
     geo = None
     # Pick the URL path segment carrying the genre id (e.g. "genre=21").
     genre_segments = [seg for seg in urlparser(url).path.split("/") if 'genre' in seg]
     try:
         genre_name = get_genre(genre_segments[0].split("=")[1])
         # geo in xpath is different ISO than in url. We want cc not xpath
         geo_match = re.compile("cc=(.*)(?=\/)").search(url)
         if geo_match is not None:
             geo = geo_match.groups()[0]
     except IndexError:
         return

     # Later matches win should a URL somehow contain several flavours.
     flavours = (('newreleases', "New Album Releases"),
                 ('justadded', "Just Added Albums"),
                 ('featuredalbums', "Featured Albums"))
     for token, label in flavours:
         if token in url:
             feed_extra = label

     if feed_extra is None or genre_name is None or geo is None:
         return

     ns = {'itms': 'http://phobos.apple.com/rss/1.0/modules/itms/'}
     chart_list = []
     for rank, entry in enumerate(feed.xpath('.//channel/item'), 1):
         item = SingleAlbumItem()
         item['artist'] = entry.xpath('itms:artist', namespaces=ns)[0].text
         item['album'] = entry.xpath('itms:album', namespaces=ns)[0].text
         item['rank'] = rank
         chart_list.append(dict(item))

     # Unique id: md5 digest of the feed URL.
     digest = hashlib.md5()
     digest.update(url)
     _id = digest.hexdigest()

     chart = ChartItem()
     chart['id'] = _id
     chart['origin'] = url
     chart['genre'] = genre_name
     chart['geo'] = geo.lower()
     chart['name'] = genre_name
     chart['extra'] = feed_extra
     chart["newrls"] = True
     chart['type'] = feed_type
     chart['list'] = chart_list
     chart['source'] = 'itunes'
     # maxage is the last item scraped; chart expires in 1 day.
     cacheControl = chartCache.setCacheControl(chartCache.timedeltaUntilDays(1))
     chart['date'] = cacheControl.get("Date-Modified")
     chart['expires'] = cacheControl.get("Date-Expires")
     chart['maxage'] = cacheControl.get("Max-Age")

     if _id == settings["ITUNES_DEFAULT_NRCHART"]:
         chart['default'] = 1

     return chart