def getName(self):
    """Read the artist name (and native-script name) from the page header.

    Looks up ``<h1 class="artist_name_hdr">``. When the header contains a
    ``<span>``, the span text is used as the name and the header text left
    after ``removeTag`` is used as the native name; otherwise both names are
    the header text.  A trailing ``" [...]"`` suffix is cut from each name.

    Returns:
        artistDBNameClass: ``err="No H1"`` when the header is missing,
        ``err="Fix"`` when the extracted name is empty, else ``err=None``.
    """
    header = self.bsdata.find("h1", {"class": "artist_name_hdr"})
    if header is None:
        return artistDBNameClass(err="No H1")

    inner = header.find("span")
    if inner is not None:
        name = inner.text.strip()
        # presumably removeTag returns the header without the span -- confirm
        header = removeTag(header, inner)
        native = header.text.strip()
    else:
        name = native = header.text.strip()

    if not name:
        return artistDBNameClass(name=name, err="Fix")

    name = fixName(name)
    native = fixName(native)
    # Drop a trailing bracketed qualifier such as "Name [2]".
    if name.endswith("]"):
        name = name.split(" [")[0].strip()
    if native.endswith("]"):
        native = native.split(" [")[0].strip()
    return artistDBNameClass(name=name, native=native, err=None)
def getName(self):
    """Read the artist name from a JSON object embedded in a ``<meta>`` tag.

    Scans every ``<meta>`` element for a ``content`` attribute that looks like
    a JSON object (starts with ``{`` and ends with ``}``) and uses the first
    one that parses.  The name is read from ``jdata['artist']['name']`` and
    split into Latin/native parts with ``self.splitNativeName``.

    Returns:
        artistDBNameClass: ``err="NoJSON"`` when no parseable JSON meta tag
        exists, ``err="BadJSON"`` when the JSON lacks ``artist.name``,
        otherwise ``err=None`` with ``name`` and ``native`` filled in.
    """
    jdata = None
    for meta in self.bsdata.findAll("meta"):
        # Not every <meta> tag has a ``content`` attribute (e.g. charset);
        # indexing attrs directly would raise KeyError on those.
        content = meta.attrs.get('content')
        if not isinstance(content, str):
            continue
        if content.startswith("{") and content.endswith("}"):
            try:
                jdata = json.loads(content)
            except ValueError:
                # json.JSONDecodeError is a ValueError; try the next tag.
                continue
            break

    if jdata is None:
        return artistDBNameClass(name=None, err="NoJSON")

    try:
        artistName = jdata['artist']['name']
    except (KeyError, TypeError):
        return artistDBNameClass(name=None, err="BadJSON")

    latinName, nativeName = self.splitNativeName(artistName)
    return artistDBNameClass(name=latinName, native=nativeName, err=None)
def getName(self):
    """Read the artist name from the headline, falling back to ld+json.

    The name comes from ``<h1 class="artistHeadline">`` when present,
    otherwise from the ``"name"`` field of the
    ``<script type="application/ld+json">`` payload.  In both cases known
    bracketed qualifiers (country codes, decades, ...) are removed; if any
    bracketed values remain afterwards, the name becomes those values joined
    with ``" & "``.

    Returns:
        artistDBNameClass: ``err="NoJSON"`` when neither the headline nor the
        script tag exists, ``err="CouldNotCompileJSON"`` when the script
        payload cannot be read, otherwise ``err=None``.
    """
    bracketPattern = r'\[(.*?)\]+'
    ignoredTags = [
        'rap', '2', '3', '4', 'NOR', 'US', 'unknown Artist', 'CHE',
        'email\xa0protected', '70s', '60s', '80s', '90s', 'BRA', 'SWE',
        'France', 'FR', 'UK', 'JP', 'DE', 'USA', 'RUS', 'ARG', 'DEU'
    ]

    def cleanName(rawName):
        """Strip ignored ``" [tag]"`` qualifiers, then prefer what remains in brackets."""
        if len(regex.findall(bracketPattern, rawName)) > 0:
            for tag in ignoredTags:
                rawName = rawName.replace(" [{0}]".format(tag), "")
        remaining = regex.findall(bracketPattern, rawName)
        return " & ".join(remaining) if len(remaining) > 0 else rawName

    h1 = self.bsdata.find("h1", {"class": 'artistHeadline'})
    if h1 is not None:
        return artistDBNameClass(name=cleanName(h1.text), err=None)

    script = self.bsdata.find("script", {"type": "application/ld+json"})
    if script is None:
        return artistDBNameClass(name=None, err="NoJSON")

    try:
        artist = json.loads(script.contents[0])["name"]
    except (IndexError, KeyError, TypeError, ValueError):
        # Empty script tag, malformed JSON, or JSON without a "name" field.
        return artistDBNameClass(name=None, err="CouldNotCompileJSON")

    return artistDBNameClass(name=cleanName(artist), err=None)
def getName(self):
    """Read the artist name from ``<h1 id="naboo_artist_name">``.

    The name is the text of the ``<span itemprop="name">`` inside that
    header.

    Returns:
        artistDBNameClass: ``err="NoH1"`` when the header is missing,
        ``err="NoSpan"`` when the span is missing, otherwise ``err=None``.
    """
    heading = self.bsdata.find("h1", {"id": "naboo_artist_name"})
    if heading is None:
        return artistDBNameClass(name=None, err="NoH1")

    nameSpan = heading.find("span", {"itemprop": "name"})
    if nameSpan is None:
        return artistDBNameClass(name=None, err="NoSpan")

    return artistDBNameClass(name=nameSpan.text, err=None)
def getName(self):
    """Read the artist name from the ``<h1>`` inside ``<section id="artist-info">``.

    Returns:
        artistDBNameClass: ``err=True`` when the section is missing,
        ``err="NoH1"`` when it has no ``<h1>``, otherwise ``err=None`` with
        the raw header text as the name.
    """
    section = self.bsdata.find("section", {"id": "artist-info"})
    if section is None:
        return artistDBNameClass(err=True)

    heading = section.find("h1")
    if heading is None:
        return artistDBNameClass(err="NoH1")

    return artistDBNameClass(name=heading.text, err=None)
def getName(self):
    """Read the artist name from ``<strong class="pagetitle">``.

    The ``" - Spotify Chart History"`` suffix is removed from the title text.

    Returns:
        artistDBNameClass: always ``err=None``; ``name`` is ``None`` when the
        title element is missing.
    """
    titleTag = self.bsdata.find("strong", {"class": "pagetitle"})
    if titleTag is None:
        name = None
    else:
        name = titleTag.text.replace(" - Spotify Chart History", "")
    return artistDBNameClass(name=name, err=None)
def getName(self):
    """Read the artist name from ``<span class="pagetitle">``.

    The name is the part of the title text before the first ``" | "``,
    with surrounding whitespace stripped.

    Returns:
        artistDBNameClass: always ``err=None``; ``name`` is ``None`` when the
        title element is missing.
    """
    titleTag = self.bsdata.find("span", {"class": "pagetitle"})
    if titleTag is None:
        name = None
    else:
        head = titleTag.text.split(" | ")[0]
        name = head.strip()
    return artistDBNameClass(name=name, err=None)
def getName(self):
    """Read the artist name from the ``<h1>`` inside ``<div class="artistheader">``.

    The header is passed to ``self.getNamesAndURLs`` and the ``name``
    attribute of its first result is used.

    Returns:
        artistDBNameClass: ``err=True`` when the header div is missing,
        ``err="NoH1"`` when it has no ``<h1>``, ``err="TxtErr"`` when no name
        can be read from the parsed references, otherwise ``err=None``.
    """
    artistData = self.bsdata.find("div", {"class": "artistheader"})
    if artistData is None:
        return artistDBNameClass(err=True)

    h1 = artistData.find("h1")
    if h1 is None:
        # Return here: previously execution fell through, called
        # getNamesAndURLs(None), and the "NoH1" result was overwritten.
        return artistDBNameClass(err="NoH1")

    ref = self.getNamesAndURLs(h1)
    try:
        artistName = ref[0].name
    except Exception:
        # Best-effort: an empty/None result or an entry without ``name``.
        return artistDBNameClass(err="TxtErr")
    return artistDBNameClass(name=artistName, err=None)
def getName(self):
    """Read the artist name from ``<div class="artist-bio-container">`` blocks.

    Every matching container is inspected for ``<h1 class="artist-name">``;
    the result from the LAST container is the one returned.

    Returns:
        artistDBNameClass: ``err=True`` when no container exists,
        ``err="NoH1"`` when the (last) container has no header,
        ``name="?"`` with ``err="Fix"`` when the header text is empty,
        otherwise the ``fixName``-processed name with ``err=None``.
    """
    containers = self.bsdata.findAll("div", {"class": "artist-bio-container"})
    if len(containers) == 0:
        return artistDBNameClass(err=True)

    # NOTE(review): later containers overwrite earlier results -- confirm
    # that "last one wins" is intended rather than "first match".
    for container in containers:
        heading = container.find("h1", {"class": "artist-name"})
        if heading is None:
            anc = artistDBNameClass(err="NoH1")
            continue
        rawName = heading.text.strip()
        if rawName:
            anc = artistDBNameClass(name=fixName(rawName), err=None)
        else:
            anc = artistDBNameClass(name="?", err="Fix")
    return anc
def getName(self):
    """Read the artist name from the desktop/mobile header or the schema script.

    Prefers ``<h1 class="hide_desktop">`` and falls back to
    ``<h1 class="hide_mobile">``.  When the chosen header has empty text, the
    ``"name"`` field of the JSON in ``<script id="artist_schema">`` is used
    instead.

    Returns:
        artistDBNameClass: ``err=True`` when neither header exists,
        ``err="Fix"`` when the header is empty and no schema script exists,
        ``err="JSON"`` when the schema script cannot be used, otherwise the
        ``fixName``-processed name with ``err=None``.
    """
    result1 = self.bsdata.find("h1", {'class': 'hide_desktop'})
    result2 = self.bsdata.find("h1", {'class': 'hide_mobile'})

    # hide_desktop wins whenever it is present; otherwise use hide_mobile.
    result = result1 or result2
    if not result:
        return artistDBNameClass(err=True)

    # The old trailing "NoH1" branch was unreachable (result is always
    # truthy at this point) and has been removed.
    artist = result.text
    if len(artist) > 0:
        return artistDBNameClass(name=fixName(artist), err=None)

    schema = self.bsdata.find("script", {"id": "artist_schema"})
    if schema is None:
        return artistDBNameClass(name=artist, err="Fix")

    try:
        artist = fixName(json.loads(schema.text)["name"])
    except Exception:
        # Best-effort: malformed JSON, missing "name", or fixName failure.
        return artistDBNameClass(name=artist, err="JSON")
    return artistDBNameClass(name=artist, err=None)
def getName(self):
    """Read the artist name from the ``initial-state`` JSON script tag.

    Parses the first child of ``<script data-name="initial-state">`` as JSON
    and takes ``"artistName"`` from the first entry of
    ``jdata['artists']['artists']``.

    Returns:
        artistDBNameClass: ``err="NoJSON"`` when the script tag is missing,
        ``err="BadJSON"`` when its content cannot be parsed,
        ``err="NoArtistName"`` when the parsed data holds no artist name
        (including an unexpected JSON layout), otherwise ``err=None``.
    """
    script = self.bsdata.find("script", {"data-name": "initial-state"})
    if script is None:
        return artistDBNameClass(name=None, err="NoJSON")

    try:
        jdata = json.loads(script.contents[0])
    except (IndexError, TypeError, ValueError):
        # Empty script tag or malformed JSON.
        return artistDBNameClass(name=None, err="BadJSON")

    try:
        # Only the first artist entry is consulted.
        firstArtist = next(iter(jdata['artists']['artists'].values()), None)
        artistName = None if firstArtist is None else firstArtist["artistName"]
    except (AttributeError, KeyError, TypeError):
        # Unexpected layout: report it like a missing name instead of
        # letting the lookup error escape to the caller.
        artistName = None

    if artistName is not None:
        return artistDBNameClass(name=artistName, err=None)
    return artistDBNameClass(name=None, err="NoArtistName")
def getName(self):
    """Read the artist name from the page's ``data-tealium-data`` attribute.

    The attribute is looked up first on
    ``<script id="initial-tealium-data">`` and then on ``<div id="tlmdata">``.
    Its value is parsed as JSON and ``"musicArtistName"`` is used as the name.

    Returns:
        artistDBNameClass: ``err="NoTealiumData"`` when neither element
        provides the attribute, ``err="NoArtistName"`` when the JSON cannot
        be parsed or lacks the name, otherwise ``err=None``.
    """
    candidates = (
        ("script", {"id": 'initial-tealium-data'}),
        ("div", {"id": "tlmdata"}),
    )
    artistdata = None
    for tagName, tagAttrs in candidates:
        try:
            artistdata = self.bsdata.find(tagName, tagAttrs).attrs['data-tealium-data']
        except (AttributeError, KeyError, TypeError):
            # Element missing (find returned None) or attribute absent.
            artistdata = None
        if artistdata is not None:
            break

    if artistdata is None:
        # Previously this result was built but never returned, so callers
        # always saw "NoArtistName" instead.
        return artistDBNameClass(name=None, err="NoTealiumData")

    try:
        artist = json.loads(artistdata)["musicArtistName"]
    except (KeyError, TypeError, ValueError):
        return artistDBNameClass(name=None, err="NoArtistName")
    return artistDBNameClass(name=artist, err=None)
def getData(self, inputdata):
    """Build an ``artistDBDataClass`` record from parsed LastFM API data.

    After ``getDataBase``/``checkData`` run, a non-None ``self.dbdata`` is
    returned as-is.  Otherwise ``self.bsdata`` must be a dict with
    ``"Tracks"`` and ``"Albums"`` lists.  Artist name/URL/MBID are taken from
    the first track, or from the first album when there are no tracks.

    Tracks with more than 50 counts and albums with more than 25 counts are
    kept, each limited to the 1000 highest counts (ties at the cut-off are
    kept).  Each kept item gets a code derived from the MD5 of its name and
    URL.

    Args:
        inputdata: Passed through to ``self.getDataBase``.

    Returns:
        ``self.dbdata`` when already set, ``None`` when there are neither
        tracks nor albums, otherwise a new ``artistDBDataClass``.

    Raises:
        ValueError: If ``self.bsdata`` is not a dict.
    """
    self.getDataBase(inputdata)
    self.checkData()
    if self.dbdata is not None:
        return self.dbdata
    if not isinstance(self.bsdata, dict):
        raise ValueError("Could not parse LastFM API data")

    tracks = self.bsdata["Tracks"]
    albums = self.bsdata["Albums"]

    if len(tracks) > 0:
        source = tracks[0]
    elif len(albums) > 0:
        source = albums[0]
    else:
        # No media at all: nothing to build.  (An unreachable ``raise`` that
        # used to follow this return has been removed.)
        return None

    artistName = source["artistName"]
    artistURL = source["artistURL"]
    artistMBID = source["artistMBID"]
    artistID = self.dbUtils.getArtistID(artistURL)

    generalData = None
    externalData = {"MusicBrainzID": artistMBID}

    def topEntries(items, minCounts, limit=1000):
        """Keep items with counts > minCounts, limited to the ``limit`` highest."""
        entries = []
        for item in items:
            counts = int(item["counts"])
            if counts > minCounts:
                entries.append({
                    "Name": item["name"],
                    "URL": item["URL"],
                    "Counts": counts
                })
        if len(entries) == 0:
            return entries
        ranked = sorted((entry["Counts"] for entry in entries), reverse=True)
        cutoff = ranked[min(len(ranked), limit) - 1]
        return [entry for entry in entries if entry["Counts"] >= cutoff]

    def toMedia(entries):
        """Wrap each entry in an artistDBMediaDataClass with an MD5-derived code."""
        media = []
        for entry in entries:
            m = md5()
            m.update(entry['Name'].encode('utf-8'))
            m.update(entry['URL'].encode('utf-8'))
            # Code is the MD5 value reduced modulo 10**7, as a string.
            code = str(int(m.hexdigest(), 16) % int(1e7))
            media.append(
                artistDBMediaDataClass(album=entry["Name"],
                                       url=entry["URL"],
                                       artist=[artistName],
                                       code=code,
                                       year=None))
        return media

    mediaData = {}
    for mediaName, items, minCounts in (("Tracks", tracks, 50),
                                        ("Albums", albums, 25)):
        entries = topEntries(items, minCounts)
        if len(entries) > 0:
            mediaData[mediaName] = toMedia(entries)

    artist = artistDBNameClass(name=artistName, err=None)
    meta = artistDBMetaClass(title=None, url=artistURL)
    url = artistDBURLClass(url=artistURL)
    ID = artistDBIDClass(ID=artistID)
    pages = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
    profile = artistDBProfileClass(general=generalData, external=externalData)
    media = artistDBMediaClass()
    media.media = mediaData
    mediaCounts = self.getMediaCounts(media)
    info = self.getInfo()

    adc = artistDBDataClass(artist=artist,
                            meta=meta,
                            url=url,
                            ID=ID,
                            pages=pages,
                            profile=profile,
                            mediaCounts=mediaCounts,
                            media=media,
                            info=info)
    return adc
def getData(self, inputdata):
    """Build an ``artistDBDataClass`` record from parsed Spotify API data.

    After ``getDataBase``/``checkData`` run, a non-None ``self.dbdata`` is
    returned as-is.  Otherwise ``self.bsdata`` must be a dict with
    ``'Artist'`` and ``'Albums'`` entries.  Albums are grouped under a media
    name built from ``album_group`` and ``album_type`` (``"Unknown"`` when
    both are missing).

    Args:
        inputdata: Passed through to ``self.getDataBase``.

    Returns:
        ``self.dbdata`` when already set, otherwise a new
        ``artistDBDataClass``.

    Raises:
        ValueError: If ``self.bsdata`` is not a dict, or if the albums'
            ``artistID`` does not match the artist's ID.
    """
    self.getDataBase(inputdata)
    self.checkData()
    if self.dbdata is not None:
        return self.dbdata
    if not isinstance(self.bsdata, dict):
        raise ValueError("Could not parse Spotify API data")

    artistData = self.bsdata['Artist']
    # NOTE(review): ``.name`` together with ``.get`` suggests a pandas
    # Series whose name is the Spotify artist ID -- confirm with the caller.
    artistID = artistData.name
    artistURI = artistData.get('uri')
    artistType = artistData.get('stype')
    artistPopularity = artistData.get('popularity')
    artistName = artistData.get('name')
    artistAPIURL = artistData.get('href')
    artistGenres = artistData.get('genres', [])
    artistFollowers = artistData.get('followers')
    artistURL = artistData.get('urls', {}).get('spotify')

    generalData = {"Type": artistType}
    genresData = artistGenres if len(artistGenres) > 0 else None
    externalData = {'SpotifyAPI': {"URL": artistAPIURL, "URI": artistURI}}
    extraData = {
        'Followers': artistFollowers,
        "Popularity": artistPopularity
    }

    mediaData = {}
    albumsData = self.bsdata['Albums']
    if len(albumsData) > 0:
        if albumsData.get('artistID') != artistID:
            raise ValueError(
                "ArtistIDs do not match for Spotify API Data! [{0}, {1}]".
                format(albumsData.get('artistID'), artistID))

        for albumData in albumsData.get('albums', []):
            albumID = albumData.get('sid')
            albumGroup = albumData.get('album_group')
            albumType = albumData.get('album_type')
            albumSType = albumData.get('stype')
            albumArtists = [{
                artist['sid']: artist['name']
            } for artist in albumData.get('artists', [])]
            albumURL = albumData.get('urls', {}).get('spotify')
            albumURI = albumData.get('uri')
            albumAPI = albumData.get('href')
            albumName = albumData.get('name')
            albumTracks = albumData.get('numtracks')
            albumDate = albumData.get('date')

            try:
                albumYear = to_datetime(
                    albumDate).year if albumDate is not None else None
            except Exception:
                # Unparseable date: keep the album, just without a year.
                albumYear = None

            if all([albumGroup, albumType]):
                mediaName = " + ".join([albumGroup, albumType])
            elif albumGroup is not None:
                mediaName = albumGroup
            elif albumType is not None:
                mediaName = albumType
            else:
                mediaName = "Unknown"

            amdc = artistDBMediaDataClass(album=albumName,
                                          url=albumURL,
                                          artist=albumArtists,
                                          code=albumID,
                                          year=albumYear,
                                          aclass=albumSType,
                                          aformat={
                                              "URI": albumURI,
                                              "API": albumAPI,
                                              "Date": albumDate,
                                              "NumTracks": albumTracks
                                          })
            mediaData.setdefault(mediaName, []).append(amdc)

    artist = artistDBNameClass(name=artistName, err=None)
    meta = artistDBMetaClass(title=None, url=artistURL)
    url = artistDBURLClass(url=artistURL)
    ID = artistDBIDClass(ID=artistID)
    pages = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
    profile = artistDBProfileClass(general=generalData,
                                   external=externalData,
                                   extra=extraData,
                                   genres=genresData)
    media = artistDBMediaClass()
    media.media = mediaData
    mediaCounts = self.getMediaCounts(media)
    info = self.getInfo()

    adc = artistDBDataClass(artist=artist,
                            meta=meta,
                            url=url,
                            ID=ID,
                            pages=pages,
                            profile=profile,
                            mediaCounts=mediaCounts,
                            media=media,
                            info=info)
    return adc
def getData(self, inputdata):
    """Build an ``artistDBDataClass`` record from parsed Deezer API data.

    After ``getDataBase``/``checkData`` run, a non-None ``self.dbdata`` is
    returned as-is.  Otherwise ``self.bsdata`` must be a dict with
    ``"Tracks"``, ``"Albums"``, ``"Name"``, ``"ID"``, ``"URL"`` and
    ``"Type"`` entries; tracks and albums are mappings from a code to an
    item with ``"Name"`` and ``"URL"``.

    Args:
        inputdata: Passed through to ``self.getDataBase``.

    Returns:
        ``self.dbdata`` when already set, otherwise a new
        ``artistDBDataClass`` whose media holds both ``"Tracks"`` and
        ``"Albums"``.

    Raises:
        ValueError: If ``self.bsdata`` is not a dict.
    """
    self.getDataBase(inputdata)
    self.checkData()
    if self.dbdata is not None:
        return self.dbdata
    if not isinstance(self.bsdata, dict):
        raise ValueError("Could not parse Deezer API data")

    artistInfo = self.bsdata
    artistTracks = artistInfo["Tracks"]
    artistAlbums = artistInfo["Albums"]
    artistName = artistInfo["Name"]
    artistID = artistInfo["ID"]
    artistURL = artistInfo["URL"]
    generalData = {"Type": artistInfo["Type"]}

    # The original reset ``mediaData`` to {} between the two sections, which
    # threw away every track just collected.  Both sections are kept now.
    # NOTE(review): if dropping tracks was intentional, remove the "Tracks"
    # entry explicitly instead.
    mediaData = {}
    for mediaName, items in (("Tracks", artistTracks),
                             ("Albums", artistAlbums)):
        mediaData[mediaName] = []
        for code, item in items.items():
            amdc = artistDBMediaDataClass(album=item["Name"],
                                          url=item["URL"],
                                          artist=[artistName],
                                          code=code,
                                          year=None)
            mediaData[mediaName].append(amdc)

    artist = artistDBNameClass(name=artistName, err=None)
    meta = artistDBMetaClass(title=None, url=artistURL)
    url = artistDBURLClass(url=artistURL)
    ID = artistDBIDClass(ID=artistID)
    pages = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
    profile = artistDBProfileClass(general=generalData)
    media = artistDBMediaClass()
    media.media = mediaData
    mediaCounts = self.getMediaCounts(media)
    info = self.getInfo()

    adc = artistDBDataClass(artist=artist,
                            meta=meta,
                            url=url,
                            ID=ID,
                            pages=pages,
                            profile=profile,
                            mediaCounts=mediaCounts,
                            media=media,
                            info=info)
    return adc
def getData(self, inputdata):
    """Build an ``artistDBDataClass`` record from parsed Discogs API data.

    After ``getDataBase``/``checkData`` run, a non-None ``self.dbdata`` is
    returned as-is.  Otherwise ``self.bsdata`` must be a dict with
    ``"Artist"`` and ``"Albums"`` entries.  A list of albums is grouped by
    ``self.getMediaType``; a dict of albums is used as the media mapping
    unchanged.

    Args:
        inputdata: Passed through to ``self.getDataBase``.

    Returns:
        ``self.dbdata`` when already set, otherwise a new
        ``artistDBDataClass``.

    Raises:
        ValueError: If ``self.bsdata`` is not a dict, or if the albums entry
            is neither a list nor a dict.
    """
    self.getDataBase(inputdata)
    self.checkData()
    if self.dbdata is not None:
        return self.dbdata
    if not isinstance(self.bsdata, dict):
        raise ValueError("Could not parse Discogs API data")

    artistData = self.bsdata["Artist"]
    albumsData = self.bsdata["Albums"]

    artistID = artistData.name
    artistName = artistData["name"]
    artistURL = "https://www.discogs.com/artist/{0}".format(artistID)

    # Profile fields: output label -> source field; None values are dropped.
    profileFields = (
        ("RealName", "realname"),
        ("Aliases", "MasterAliases"),
        ("Groups", "MasterGroups"),
        ("Members", "MasterMembers"),
        ("Variations", "MasterNameVariations"),
    )
    generalData = {}
    for label, field in profileFields:
        value = artistData[field]
        if value is not None:
            generalData[label] = value
    if len(generalData) == 0:
        generalData = None

    # Releases
    mediaData = {}
    if isinstance(albumsData, list):
        for release in albumsData:
            releaseFields = {
                "code": release.get('id'),
                "aformat": release.get('format'),
                "label": release.get('label'),
                "album": release.get('name'),
                "url": release.get('url'),
                "artist": release.get('artist'),
                "year": release.get('year'),
                "main": release.get('main_release'),
            }
            mediaName = self.getMediaType(release)
            amdc = artistDBMediaDataClass(album=releaseFields["album"],
                                          url=releaseFields["url"],
                                          artist=releaseFields["artist"],
                                          code=releaseFields["code"],
                                          aformat=releaseFields["aformat"],
                                          aclass={
                                              "Label": releaseFields["label"],
                                              "Main": releaseFields["main"]
                                          },
                                          year=releaseFields["year"])
            mediaData.setdefault(mediaName, []).append(amdc)
    elif isinstance(albumsData, dict):
        mediaData = albumsData
    else:
        raise ValueError("Not sure how to process albums [{0}]".format(albumsData))

    artist = artistDBNameClass(name=artistName, err=None)
    meta = artistDBMetaClass(title=None, url=artistURL)
    url = artistDBURLClass(url=artistURL)
    ID = artistDBIDClass(ID=artistID)
    pages = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
    profile = artistDBProfileClass(general=generalData)
    media = artistDBMediaClass()
    media.media = mediaData
    mediaCounts = self.getMediaCounts(media)
    info = artistDBFileInfoClass(info=None)

    return artistDBDataClass(artist=artist,
                             meta=meta,
                             url=url,
                             ID=ID,
                             pages=pages,
                             profile=profile,
                             mediaCounts=mediaCounts,
                             media=media,
                             info=info)