Ejemplo n.º 1
0
    def downloadMissingArtistExtras(self, maxPages=None):
        """Download every artist "extra" page that is not yet saved on disk.

        Iterates ``self.metadata`` (modVal -> {artistID -> page data}).  Each
        page-data entry is read through the keys "Name", "URL" and "Pages".
        For every page index in ``range(Pages)`` the save name is resolved and,
        when no file exists there, the page URL is downloaded to that name.
        A failed download is reported and the loop carries on.

        Args:
            maxPages: Optional cap on the page index.  Indices greater than
                ``maxPages`` are skipped, i.e. indices ``0..maxPages``
                inclusive are still processed.  ``None`` disables the cap.
                NOTE(review): the inclusive bound means ``maxPages + 1`` pages
                are fetched -- confirm this is intended.
        """
        ts = timestat("Downloading Missing Artist Extra Files")
        for modVal, modValData in self.metadata.items():
            N = len(modValData)
            tsMod = timestat("Downloading {0} Missing Artist Extra Files For ModVal={1}".format(N, modVal))
            for i, (artistID, artistPageData) in enumerate(modValData.items()):
                artistName = artistPageData["Name"]
                artistURL = artistPageData["URL"]
                pages = artistPageData["Pages"]
                print("="*100)
                print("{0}/{1}:  [{2}] / [{3}]".format(i, N, artistName, artistURL))
                for page in range(pages):
                    # Page indices only grow, so once past the cap we can stop.
                    if maxPages is not None and page > maxPages:
                        break
                    url = self.dbArtists.getArtistURL(artistURL, page=page)
                    savename = self.dutils.getArtistSavename(artistID, page=page)
                    if isFile(savename):
                        continue

                    print("{0}/{1}:  [{2}] / [{3}] / [{4}-{5}]".format(i, N, artistName, artistURL, page, pages))

                    # Best-effort download: report and move on, but let
                    # KeyboardInterrupt/SystemExit propagate.
                    try:
                        self.dutils.downloadArtistURL(url, savename)
                    except Exception:
                        print("Error downloading {0}".format(url))

            tsMod.stop()
        ts.stop()
Ejemplo n.º 2
0
    def createDBData(self,
                     db=None,
                     fromMetadata=True,
                     fromMaps=False,
                     fromMerge=True):
        """Build the artist and artist-album data for one DB or for all DBs.

        Args:
            db: Name of a single DB to process.  When ``None`` every key of
                ``self.dbdata`` is processed.
            fromMetadata, fromMaps, fromMerge: Passed through unchanged to
                ``createDBArtistData`` and ``createDBArtistAlbumData``.
        """
        if db is None:
            dbs = self.dbdata.keys()
        else:
            dbs = [db]

        separator = "=" * 200
        title = (
            "create DB Data (fromMetadata={0}, fromMaps={1}, fromMerge={2}) For [{3}] DBs"
            .format(fromMetadata, fromMaps, fromMerge, ", ".join(dbs)))
        tsFull = timestat(title)

        for dbName in dbs:
            print(separator)

            # Artists first, then the albums of those artists.
            tsArtists = timestat("Setting DB Artists For {0}".format(dbName))
            self.createDBArtistData(dbName, fromMetadata, fromMaps, fromMerge)
            tsArtists.stop()

            tsAlbums = timestat("Setting DB Albums For {0}".format(dbName))
            self.createDBArtistAlbumData(dbName, fromMetadata, fromMaps,
                                         fromMerge)
            tsAlbums.stop()

            print(separator)
            print("")

        tsFull.stop()
Ejemplo n.º 3
0
    def downloadMissingArtistUnofficial(self):
        """Download the "unofficial" page of every artist that lacks one on disk.

        Iterates ``self.metadata`` (modVal -> {artistID -> page data}), reading
        the "Name" and "URL" keys of each entry.  The unofficial URL and save
        name are resolved per artist; artists whose save file already exists
        are skipped.  A failed download is reported and the loop carries on.
        """
        ts = timestat("Downloading Missing Artist Unofficial Files")
        for modVal, modValData in self.metadata.items():
            N = len(modValData)
            tsMod = timestat(
                "Downloading {0} Missing Artist Unofficial Files For ModVal={1}"
                .format(N, modVal))
            for i, (artistID, artistPageData) in enumerate(modValData.items()):
                artistName = artistPageData["Name"]
                artistURL = artistPageData["URL"]

                print("=" * 100)
                print("{0}/{1}:  [{2}] / [{3}]".format(i, N, artistName,
                                                       artistURL))
                url = self.dbArtists.getArtistURL(artistURL, unofficial=True)
                savename = self.dutils.getArtistSavename(artistID,
                                                         unofficial=True)

                if isFile(savename):
                    continue

                # Best-effort download: report and move on, but let
                # KeyboardInterrupt/SystemExit propagate.
                try:
                    self.dutils.downloadArtistURL(url, savename)
                except Exception:
                    print("Error downloading {0}".format(url))

            tsMod.stop()
        ts.stop()
Ejemplo n.º 4
0
 def parse(self, expr, force=False, debug=False, quiet=False):
     """Copy each new raw artist file to its artist-ID based save name.

     The files returned by ``getArtistRawFiles`` are parsed with
     ``self.artist.getData`` to obtain the artist ID; the raw file content is
     then saved under ``self.dutils.getArtistSavename(artistID)``.  Files with
     no artist ID or no save name are skipped.

     Args:
         expr: Forwarded to ``getArtistRawFiles``.
         force: Forwarded to ``getArtistRawFiles``.
         debug: Unused by this method.
         quiet: Unused by this method.
     """
     ts = timestat("Parsing Raw Files")

     tsFiles = timestat("Finding Files To Parse")
     newFiles = self.getArtistRawFiles(datatype=self.datatype, expr=expr, force=force)
     tsFiles.stop()

     N = len(newFiles)
     tsParse = timestat("Parsing {0} New Raw Files".format(N))

     newData = 0
     modValue = 250 if N >= 500 else 50
     for i, ifile in enumerate(newFiles):
         if (i+1) % modValue == 0 or (i+1) == N:
             tsParse.update(n=i+1, N=N)
         retval = self.artist.getData(ifile)
         artistID = retval.ID.ID
         if artistID is None:
             continue
         savename = self.dutils.getArtistSavename(artistID)
         if savename is None:
             continue
         # Read the raw content only once we know it will actually be saved.
         saveFile(idata=getFile(ifile), ifile=savename, debug=False)
         newData += 1

     print("Created {0}/{1} New Artist Files".format(newData, N))
     tsParse.stop()
     # The overall timer was previously left running.
     ts.stop()
0
    def parse(self, expr, force=False, debug=False, quiet=False):
        """Copy each raw HTML file to its artist-ID based save name.

        Every file returned by ``getArtistRawHTMLFiles`` is read, parsed with
        ``self.artist.getData`` to obtain the artist ID, and its raw content is
        written to ``self.dutils.getArtistSavename(artistID)``.

        Args:
            expr: Forwarded to ``getArtistRawHTMLFiles``.
            force: Forwarded to ``getArtistRawHTMLFiles``.
            debug: When truthy, prints per-file details and updates the
                progress timer on every file.
            quiet: Unused by this method.
        """
        ts = timestat("Parsing Raw HTML Files")

        tsFiles = timestat("Finding Files To Parse")
        rawFiles = self.getArtistRawHTMLFiles(expr, force)
        tsFiles.stop()

        N = len(rawFiles)
        if debug:
            print("Parsing {0} Raw HTML Files From Expr[{1}]".format(N, expr))

        # Progress is reported every `modValue` files and on the last file.
        if N >= 500:
            modValue = 250
        else:
            modValue = 50

        tsParse = timestat("Parsing {0} Raw HTML Files".format(N))
        for idx, rawFile in enumerate(rawFiles):
            done = idx + 1
            if debug or done == N or done % modValue == 0:
                tsParse.update(n=done, N=N)

            if debug:
                print("{0}/{1}\tParsing {2}".format(idx, N, rawFile))

            content = getFile(rawFile)
            parsed = self.artist.getData(rawFile)
            artistID = parsed.ID.ID
            if debug:
                print("  ---> ID={0}".format(artistID))

            target = self.dutils.getArtistSavename(artistID)
            saveFile(idata=content, ifile=target, debug=False)

        tsParse.stop()
        ts.stop()
Ejemplo n.º 6
0
 def parse(self, expr, force=False, debug=False, quiet=False):
     """Parse raw pickled HTML files and save the parsed artist data.

     Each file returned by ``getArtistRawFiles(datatype="data", ...)`` is
     parsed with ``self.artist.getData``.  The parsed result is saved under
     ``self.dutils.getArtistSavename(artistID)`` when that name is a string
     and either ``force`` is set or no file exists there yet.

     Args:
         expr: Forwarded to ``getArtistRawFiles``.
         force: Forwarded to ``getArtistRawFiles``; also overwrites existing
             save files when truthy.
         debug: When truthy, reports files that yield no data or no artist ID
             and updates the progress timer on every file.
         quiet: Only echoed in the timer message.
     """
     ts = timestat("Parsing Raw Pickled HTML Files(expr=\'{0}\', force={1}, debug={2}, quiet={3})".format(expr, force, debug, quiet))

     io = fileIO()
     newFiles = self.getArtistRawFiles(datatype="data", expr=expr, force=force)

     N = len(newFiles)
     modValue = 250 if N >= 500 else 50
     nSave = 0
     tsParse = timestat("Parsing {0} Raw Picked HTML Files".format(N))
     for i, ifile in enumerate(newFiles):
         if (i+1) % modValue == 0 or (i+1) == N or debug:
             tsParse.update(n=i+1, N=N)
         retval = self.artist.getData(ifile)
         if retval is None:
             if debug:
                 print("Could not find data for {0}".format(ifile))
             continue
         artistID = retval.ID.ID
         if artistID is None:
             if debug:
                 print("Could not find artistID for {0}".format(ifile))
             continue
         savename = self.dutils.getArtistSavename(artistID)
         if isinstance(savename, str) and (force or not fileUtil(savename).exists):
             io.save(idata=retval, ifile=savename)
             nSave += 1

     # The parse timer was previously left running.
     tsParse.stop()
     ts.stop()
     print("Saved {0} New Files".format(nSave))
Ejemplo n.º 7
0
    def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False):
        """Add artists found in the Discogs search data to the ModVal DB.

        Loads (or creates) the DB Series for ``modVal``, loads the previous
        album metadata, and reads the single search file whose basename is
        ``"artistData-<modVal>"``.  Every row whose artist ID is not yet in
        the DB is converted with ``self.artist.getData`` and added.  The DB
        is saved only when at least one new entry was added.

        Args:
            modVal: ModVal bucket to process.
            expr: Forwarded to ``getArtistRawFiles``.
            force, debug, quiet: Only echoed in the timer message.

        Raises:
            ValueError: If there is not exactly one matching search file.
        """
        # Local import keeps this method self-contained; pandas is already a
        # dependency of this code through Series.
        from pandas import concat

        ts = timestat("Parsing Discogs Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

        io = fileIO()

        ########################################################################################
        # Previous DB Data
        ########################################################################################
        if not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
            tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
            dbdata = Series({}, dtype=object)
        else:
            tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
            dbdata = self.disc.getDBModValData(modVal)
        # Stop the DB timer in both branches (the new-DB branch used to stop
        # the overall timer instead).
        tsDB.stop()

        ########################################################################################
        # Previous Media Data
        ########################################################################################
        previousMetadata = self.disc.getMetadataAlbumData(modVal)

        ########################################################################################
        # Artist Search Data (No Media)
        ########################################################################################
        tsDB = timestat("Loading Artist Search Data For ModVal={0}".format(modVal))
        artistSearchFilenames = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
        wantedBasename = "artistData-{0}".format(modVal)
        artistSearchFilename = [x for x in artistSearchFilenames if fileUtil(x).basename == wantedBasename]
        if len(artistSearchFilename) == 1:
            artistSearchData = io.get(artistSearchFilename[0])
        else:
            raise ValueError("Could not find Discogs API Artist Search Data")
        tsDB.stop()

        N = artistSearchData.shape[0]
        modValue = 5000 if N >= 50000 else 1000
        tsParse = timestat("Parsing {0} Searched For Discogs API Artists".format(N))
        # Collect new entries and concatenate once: appending to the Series
        # per row copies it every time, and Series.append no longer exists in
        # pandas 2.x.
        newEntries = {}
        for i, (artistID, artistData) in enumerate(artistSearchData.iterrows()):
            if (i+1) % modValue == 0 or (i+1) == N:
                tsParse.update(n=i+1, N=N)
            if dbdata.get(artistID) is not None or newEntries.get(artistID) is not None:
                continue
            artistAPIData = {"Artist": artistData, "Albums": previousMetadata.get(artistID, {})}
            newEntries[artistID] = self.artist.getData(artistAPIData)
        tsParse.stop()

        Nnew = len(newEntries)
        if Nnew > 0:
            dbdata = concat([dbdata, Series(newEntries, dtype=object)])
            print("Saving [{0}/{1}] {2} Entries To {3}".format(Nnew, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
            self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
        else:
            print("Not saving any of the new data")

        ts.stop()
Ejemplo n.º 8
0
    def findMyMusic(self, primeDir=None, artistName=None):
        """Collect album path data for artists found under the music directory.

        Three modes, checked in this order:
          * ``primeDir`` given: scan only that prime directory.
          * ``artistName`` given (and no ``primeDir``): look up that artist
            via ``getArtistPathData(artistName)``.
          * neither given: scan every prime directory in ``self.pdDirs``.

        The result is stored in ``self.artistAlbums`` and returned.

        Args:
            primeDir: Name of a single prime directory below ``self.musicDir``.
            artistName: Name of a single artist.

        Returns:
            For the scanning modes a dict mapping artist directory name to the
            value of ``getArtistPathData(name, path)``; for the single-artist
            mode whatever ``getArtistPathData(artistName)`` returns.
        """
        def scanPrimeDirs(primeDirs):
            """Return (artist albums dict, number of existing prime dirs)."""
            candidatePaths = {pd: setDir(self.musicDir, pd) for pd in primeDirs}
            # Keep only the prime directories that exist on disk.
            pdPaths = {
                pd: pdpath
                for pd, pdpath in candidatePaths.items()
                if dirUtil(pdpath).isDir()
            }
            pdArtistPaths = {
                pd: findDirs(pdpath)
                for pd, pdpath in pdPaths.items()
            }
            artistDirs = [dirUtil(ap) for ap in getFlatList(pdArtistPaths.values())]
            artistPaths = {fsap.name: fsap.path for fsap in artistDirs}
            albums = {
                name: self.getArtistPathData(name, path)
                for name, path in artistPaths.items()
            }
            return albums, len(pdArtistPaths)

        artistAlbums = {}
        if primeDir is not None:
            ts = timestat(
                "Finding All Artist Albums From [{0}] Prime Directory".format(
                    primeDir))
            artistAlbums, _ = scanPrimeDirs([primeDir])
            print("  Found {0} Artists From [{1}] Prime Directory".format(
                len(artistAlbums), primeDir))
            ts.stop()
        elif artistName is not None:
            ts = timestat("Finding [{0}] Artist Albums".format(artistName))
            artistAlbums = self.getArtistPathData(artistName)
            ts.stop()
        else:
            ts = timestat("Find PrimeDir Artist Paths")
            artistAlbums, nPrimeDirs = scanPrimeDirs(self.pdDirs)
            print("  Found {0} Artists From {1} Prime Directories".format(
                len(artistAlbums), nPrimeDirs))
            ts.stop()

        self.artistAlbums = artistAlbums
        return artistAlbums
Ejemplo n.º 9
0
    def parse(self, modVal, expr, force=False, debug=False):
        """Merge newly found unofficial artist files into the ModVal DB.

        For every file returned by ``getArtistUnofficialFiles`` the artist ID
        is taken from the base filename and the file is parsed with
        ``self.artist.getData``.  Unknown artists are inserted as-is.  For
        known artists each media list is merged by item ``code``; on a code
        clash the newly parsed item replaces the stored one.  The DB is saved
        when at least one file was processed.

        Args:
            modVal: ModVal bucket to process.
            expr: Forwarded to ``getArtistUnofficialFiles``.
            force: Forwarded to ``getArtistUnofficialFiles`` and ``getDBData``.
            debug: Unused by this method.
        """
        ts = timestat("Parsing ModVal={0} Unofficial Files".format(modVal))

        tsFiles = timestat("Finding Files To Parse")
        newFiles = self.getArtistUnofficialFiles(modVal, expr, force)
        tsFiles.stop()

        N = len(newFiles)
        modValue = 50 if N >= 100 else 10
        # The DB is only needed (and only loaded) when there are files.
        dbdata = None
        if N > 0:
            tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
            dbdata = self.getDBData(modVal, force)
            tsDB.stop()

        newData = 0
        tsParse = timestat(
            "Parsing {0} New Unofficial Files For ModVal={1}".format(
                N, modVal))
        for i, ifile in enumerate(newFiles):
            if (i + 1) % modValue == 0 or (i + 1) == N:
                print("{0: <15}Parsing {1}".format("{0}/{1}".format(i + 1, N),
                                                   ifile))
            artistID = getBaseFilename(ifile)
            info = self.artist.getData(ifile)
            newData += 1

            if dbdata.get(artistID) is None:
                dbdata[artistID] = info
                continue

            # Only media types present in the new data can change anything,
            # so iterate those directly.
            storedMedia = dbdata[artistID].media.media
            for mediaName, newItems in info.media.media.items():
                if newItems is None:
                    continue
                storedItems = storedMedia.get(mediaName)
                merged = {} if storedItems is None else {
                    item.code: item for item in storedItems
                }
                merged.update({item.code: item for item in newItems})
                storedMedia[mediaName] = list(merged.values())

        tsParse.stop()

        print("Found {0} Unofficial Artist Records For ModVal={1}".format(
            newData, modVal))
        if newData > 0:
            self.saveDBData(modVal, dbdata, newData)

        # The overall timer was previously left running.
        ts.stop()
Ejemplo n.º 10
0
    def parse(self, modVal, expr, force=False, debug=False, quiet=False):
        """Parse the primary artist files of one ModVal into its DB.

        Each file returned by ``getArtistPrimaryFiles`` is parsed with
        ``self.artist.getData``.  The result is stored under the artist ID
        taken from the base filename, provided the parsed ID matches that
        filename ID; mismatches are skipped.

        Args:
            modVal: ModVal bucket to process.
            expr: Forwarded to ``getArtistPrimaryFiles``.
            force: Forwarded to ``getArtistPrimaryFiles``; when ``True`` the
                DB is rebuilt from scratch instead of being loaded.
            debug: When truthy, prints per-file details.  When exactly
                ``True``, an ID mismatch aborts with ``ValueError``.
            quiet: Only echoed in the timer message.

        Returns:
            ``True`` when at least one entry was stored and the DB saved,
            otherwise ``False`` (including when there are no files).

        Raises:
            ValueError: On an ID mismatch while ``debug is True``.
        """
        ts = timestat("Parsing Primary ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

        tsFiles = timestat("Finding Files To Parse")
        newFiles = self.getArtistPrimaryFiles(modVal, expr, force)
        tsFiles.stop()

        N = len(newFiles)
        if N == 0:
            ts.stop()
            return False

        # Report progress roughly every tenth of the files, in steps of 250.
        modValue = max([250 * round((N/10)/250), 250])

        if force is True or not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
            tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
            dbdata = {}
        else:
            tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
            dbdata = self.disc.getDBModValData(modVal)
        # Stop the DB timer in both branches (the new-DB branch used to stop
        # the overall timer instead).
        tsDB.stop()

        newData = 0
        tsParse = timestat("Parsing {0} New Files For ModVal={1}".format(N, modVal))
        for i, ifile in enumerate(newFiles):
            if (i+1) % modValue == 0 or (i+1) == N:
                tsParse.update(n=i+1, N=N)

            artistID = getBaseFilename(ifile)
            info = self.artist.getData(ifile)
            if debug:
                print("\t", ifile, ' ==> ', info.ID.ID, ' <-> ', artistID)
            if info.ID.ID != artistID:
                if debug is True:
                    msg = "Error for {0}  ID={1}  FileID={2}".format(info.meta.title, info.ID.ID, artistID)
                    print(msg)
                    # Explicit abort in debug mode (replaces the `1/0` hack).
                    raise ValueError(msg)
                continue
            dbdata[artistID] = info
            newData += 1
        tsParse.stop()

        if newData > 0:
            dbdata = Series(dbdata)
            print("Saving [{0}/{1}] {2} Entries To {3}".format(newData, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
            self.disc.saveDBModValData(modVal=modVal, idata=dbdata)

        ts.stop()

        return newData > 0
Ejemplo n.º 11
0
    def downloadUnknownArtistCompositions(self):
        """Download the songs page for artists without a saved composition file.

        For each artist in ``self.metadata`` whose song save file is missing,
        the last path segment of the artist URL is replaced by ``songs/all``
        and the page is downloaded.  The page is saved only when the response
        is 200 and the HTML contains a ``th`` element with class
        ``title-composer``; otherwise the artist ID is collected and finally
        passed to ``updateMasterIgnoreCompositionData``.
        """
        newIgnores = []
        for modVal, modValMetadata in self.metadata.items():
            N = len(modValMetadata)
            ts = timestat(
                "Downloading {0} Unknown Composition Files For ModVal={1}".
                format(N, modVal))
            for i, (artistID,
                    artistIDData) in enumerate(modValMetadata.items()):
                savename = self.dutils.getArtistSavename(artistID, song=True)

                creditsHref = artistIDData["URL"]
                artist = artistIDData["Name"]
                if isFile(savename):
                    continue

                ## Replace /credits with /songs
                segments = creditsHref.split('/')[:-1]
                songsHref = "/".join(segments + ["songs", "all"])

                ## Create Full URL
                url = urllib.parse.urljoin(self.dbArtists.baseURL, songsHref)
                print("\n")
                print("=" * 100)
                print("{0}/{1}:  [{2}] / [{3}]".format(i, N, artist, url))

                data, response = self.dutils.downloadURL(url)
                saved = False
                if response == 200:
                    bsdata = getHTML(data)
                    composerHeaders = bsdata.findAll(
                        "th", {"class": "title-composer"})
                    if len(composerHeaders) > 0:
                        print("  ---> Saving Data To {0}".format(savename))
                        saveFile(idata=data, ifile=savename)
                        saved = True

                # Pause after every request, saved or not.
                sleep(3)
                if saved:
                    continue

                newIgnores.append(artistID)

                if i == 20:
                    break
            ts.stop()

        print("New IDs To Ignore")
        print(newIgnores)
        tsUpdate = timestat(
            "Adding {0} ArtistIDs To Master Composition Ignore List".format(
                len(newIgnores)))
        self.updateMasterIgnoreCompositionData(newIgnores)
        tsUpdate.stop()
Ejemplo n.º 12
0
 def parse(self, modVal, expr, force=False, debug=False, quiet=False):
     """Parse raw pickled Spotify API primary files into the ModVal DB.

     Each file returned by ``getArtistPrimaryFiles`` is loaded and its
     ``'artistID'`` is looked up in the artist search data.  Files whose ID
     is not found there are reported and skipped.  The remaining ones are
     converted with ``self.artist.getData`` and added to the DB, which is
     saved when at least one entry was added.

     Args:
         modVal: ModVal bucket to process.
         expr: Forwarded to ``getArtistPrimaryFiles`` / ``getArtistRawFiles``.
         force: Forwarded to ``getArtistPrimaryFiles``; when ``True`` the DB
             is rebuilt from scratch instead of being loaded.
         debug, quiet: Only echoed in the timer message.

     Raises:
         ValueError: If there is not exactly one artist search file.
     """
     # Local import keeps this method self-contained; pandas is already a
     # dependency of this code through Series.
     from pandas import concat

     ts = timestat("Parsing Raw Pickled Spotify API Primary ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

     io = fileIO()
     newFiles = self.getArtistPrimaryFiles(modVal, expr, force)
     print("Found {0} New Files".format(len(newFiles)))
     if len(newFiles) == 0:
         # Do not leave the overall timer running on the early exit.
         ts.stop()
         return

     artistSearchFilename = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
     if len(artistSearchFilename) == 1:
         artistSearchData = io.get(artistSearchFilename[0])
     else:
         raise ValueError("Could not find Spotify API Artist Search Data")

     if force is True or not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
         tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
         dbdata = Series({}, dtype=object)
     else:
         tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
         dbdata = self.disc.getDBModValData(modVal)
     # Stop the DB timer in both branches (the new-DB branch used to stop
     # the overall timer instead).
     tsDB.stop()

     N = len(newFiles)
     modValue = 500 if N >= 5000 else 100
     tsParse = timestat("Parsing {0} Raw Picked API Files".format(N))
     # Collect new entries and concatenate once: appending to the Series per
     # file copies it every time, and Series.append no longer exists in
     # pandas 2.x.  Lists (not a dict) keep one entry per file.
     newIDs = []
     newValues = []
     for i, ifile in enumerate(newFiles):
         if (i+1) % modValue == 0 or (i+1) == N:
             tsParse.update(n=i+1, N=N)
         dData = io.get(ifile)
         artistID = dData['artistID']
         try:
             artistData = artistSearchData.loc[artistID]
         except (KeyError, TypeError):
             print("Could not find Spotify ID [{0}]".format(artistID))
             continue

         artistAPIData = {"Artist": artistData, "Albums": dData}
         newIDs.append(artistID)
         newValues.append(self.artist.getData(artistAPIData))
     tsParse.stop()

     nSave = len(newIDs)
     if nSave > 0:
         dbdata = concat([dbdata, Series(newValues, index=newIDs, dtype=object)])
         print("Saving [{0}/{1}] {2} Entries To {3}".format(nSave, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
         self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
     else:
         print("Not saving any of the new data")

     ts.stop()
Ejemplo n.º 13
0
    def createArtistIDMap(self):
        """Build and save the artist ID -> name and artist ID -> ref maps.

        For ModVals 0..99 the artist metadata is loaded.  Element 0 of each
        entry is passed through ``self.manc.getArtistName`` to give the name;
        element 1 is used as the ref.  Both maps are saved through
        ``self.disc``.
        """
        # Local import keeps this method self-contained; pandas is already a
        # dependency of this code through Series.
        from pandas import concat

        print("="*125)
        ts = timestat("Creating Artist DBs for ==> {0} <==".format(self.db))
        print("="*125)

        # Collect the per-ModVal pieces and concatenate once: Series.append
        # copied the whole Series on every call and no longer exists in
        # pandas 2.x.
        nameParts = []
        refParts = []
        nEntries = 0

        modValue = 10 if self.debug is False else 1
        for modVal in range(100):
            metadata = self.disc.getMetadataArtistData(modVal)
            nameParts.append(metadata.apply(lambda x: self.manc.getArtistName(x[0])))
            refParts.append(metadata.apply(lambda x: x[1]))
            nEntries += len(nameParts[-1])
            if (modVal+1) % modValue == 0:
                print("{0: <15}{1: >9}".format("ModVal={0}".format(modVal+1), nEntries))
        print("\n\n==============================================\n")

        artistIDToName = concat(nameParts)
        artistIDToRef = concat(refParts)

        print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToName), "ID => Name", self.disc.getArtistIDToNameFilename()))
        self.disc.saveArtistIDToNameData(idata=artistIDToName)

        # Report the size of the ref map itself (was the name map's size).
        print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToRef), "ID => Ref", self.disc.getArtistIDToRefFilename()))
        self.disc.saveArtistIDToRefData(idata=artistIDToRef)

        ts.stop()
Ejemplo n.º 14
0
    def createArtistIDAlbumsMap(self):
        """Build and save the artist ID -> album count / album names maps.

        For ModVals 0..99 the album metadata is loaded.  For each artist the
        media-name -> media-data mapping is copied into the album-names map,
        and the sum of the media-data lengths is stored as the album count.
        Both maps are saved as Series through ``self.disc``.
        """
        print("="*125)
        ts = timestat("Creating Album DBs for ==> {0} <==".format(self.db))
        print("="*125)

        artistIDToNumAlbums = {}
        artistIDToAlbumNames = {}
        nAllAlbums = 0

        modValue = 10 if self.debug is False else 1
        for modVal in range(100):
            metadata = self.disc.getMetadataAlbumData(modVal)
            # .items() instead of .iteritems(), which pandas 2.x removed.
            for artistID, artistData in metadata.items():
                albumNames = {mediaName: mediaData for mediaName, mediaData in artistData.items()}
                numAlbums = sum(len(mediaData) for mediaData in albumNames.values())
                artistIDToAlbumNames[artistID] = albumNames
                artistIDToNumAlbums[artistID] = numAlbums
                nAllAlbums += numAlbums

            if (modVal+1) % modValue == 0:
                print("{0: <15}{1: >9}{2: >9}".format("ModVal={0}".format(modVal+1), len(artistIDToNumAlbums), nAllAlbums))

        print("\n\n==============================================\n")

        artistIDToNumAlbums = Series(artistIDToNumAlbums)
        print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToNumAlbums), "ID => NumAlbums", self.disc.getArtistIDToNumAlbumsFilename()))
        self.disc.saveArtistIDToNumAlbumsData(idata=artistIDToNumAlbums)

        artistIDToAlbumNames = Series(artistIDToAlbumNames)
        print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToAlbumNames), "ID => AlbumNames", self.disc.getArtistIDToAlbumNamesFilename()))
        self.disc.saveArtistIDToAlbumNamesData(idata=artistIDToAlbumNames)

        ts.stop()
Ejemplo n.º 15
0
    def createArtistMetadataMap(self):
        """Build and save the artist ID -> genre / style / collaborations maps.

        Reads every ``*-ArtistMetadata*.p`` file found in the albums metadata
        DB directory, collects the 'Genre', 'Artists' and 'Style' entries per
        artist ID, and saves each of the three maps as a Series.
        """
        ts = timestat("Creating Artist DBs")

        genreByID = {}
        styleByID = {}
        collaborationsByID = {}

        metadataDir = self.disc.getAlbumsMetadataDBDir()
        metadataFiles = findPatternExt(metadataDir, pattern="-ArtistMetadata", ext='.p')

        for metadataFile in metadataFiles:
            # File name and running total share one output line.
            print(metadataFile, '\t', end="")
            for artistID, artistData in getFile(metadataFile).items():
                genreByID[artistID] = artistData['Genre']
                collaborationsByID[artistID] = artistData['Artists']
                styleByID[artistID] = artistData['Style']
            print(len(genreByID))
        print("\n\n==============================================\n")

        outputs = (
            ("IDToGenre", genreByID),
            ("IDToStyle", styleByID),
            ("IDToCollaborations", collaborationsByID),
        )
        for basename, savedata in outputs:
            filename = "Artist{0}.p".format(basename)
            savename = setFile(self.disc.getDiscogDBDir(), filename)
            print("Saving {0} entries to {1}\n".format(len(savedata), savename))
            saveFile(ifile=savename, idata=Series(savedata), debug=True)

        ts.stop()
Ejemplo n.º 16
0
 def createAlbumMetadata(self):
     """Build and save the album-name metadata for ``self.modVal``.

     For every artist in ``self.dbData`` that has a name, each media type is
     mapped to a ``{code: album}`` dict built from its media items.  Media
     types whose items cannot be converted are left out and the artist is
     recorded in an error dict, which is printed at the end.
     """
     ts = timestat("Creating Artist Album Metadata For ModVal={0}".format(self.modVal))
     artistIDMetadata = {}
     errs = {}
     for artistID, artistData in self.dbData.items():
         artistID = str(artistID)
         if artistData.artist.name is None:
             continue
         artistIDMetadata[artistID] = {}
         for mediaName, mediaData in artistData.media.media.items():
             # Best-effort per media type: record the artist and move on, but
             # let KeyboardInterrupt/SystemExit propagate.
             try:
                 albumNames = {mediaValues.code: mediaValues.album for mediaValues in mediaData}
             except Exception:
                 errs[artistID] = artistData.artist.name
                 continue
             artistIDMetadata[artistID][mediaName] = albumNames
     artistIDMetadata = Series(artistIDMetadata)

     print("Saving [{0}] {1} Entries To {2}".format(len(artistIDMetadata), "ID => AlbumNames", self.disc.getMetadataAlbumFilename(self.modVal)))
     self.disc.saveMetadataAlbumData(idata=artistIDMetadata, modVal=self.modVal)

     ts.stop()

     print(errs)
Ejemplo n.º 17
0
    def createCompositionMetadata(self, modVal=None):
        """Fill ``self.metadata`` with the artists whose compositions are wanted.

        For each ModVal the DB data is loaded and reduced to the artists whose
        URL ends with ``/credits`` and whose ID is not in ``self.songIgnores``.
        The surviving ``{artistID: {"Name": ..., "URL": ...}}`` mapping is
        stored in ``self.metadata[modVal]``.

        Args:
            modVal: A single ModVal to process, or ``None`` for 0..99.
        """
        modVals = [modVal] if modVal is not None else range(100)

        ts = timestat("Creating AllMusic Composition Metadata")
        for currentModVal in modVals:

            tsDBData = timestat(
                "Finding Known Credit Artists For ModVal={0}".format(
                    currentModVal))
            dbData = self.getDBData(currentModVal)
            dbArtistURLs = {
                artistID: {
                    "Name": artistData.artist.name,
                    "URL": artistData.url.url
                }
                for artistID, artistData in dbData.items()
            }
            tsDBData.stop()

            tsCredit = timestat(
                "Finding Known Credit Artists From {0} Artists For ModVal={1}".
                format(len(dbArtistURLs), currentModVal))
            creditArtistIDs = {
                artistID: artistData
                for artistID, artistData in dbArtistURLs.items()
                if artistData["URL"] is not None
                and artistData["URL"].endswith("/credits")
            }
            tsCredit.stop()

            # The second placeholder was {0}, so the ModVal was never shown.
            tsIgnore = timestat(
                "Removing IDs To Ignore From {0} Primary Files For ModVal={1}".
                format(len(creditArtistIDs), currentModVal))
            availableArtistIDs = {
                artistID: artistData
                for artistID, artistData in creditArtistIDs.items()
                if artistID not in self.songIgnores
            }
            tsIgnore.stop()

            tsMeta = timestat(
                "Finding Metadata For {0}/{1}/{2} Missing ArtistIDs for ModVal={3}"
                .format(len(availableArtistIDs), len(creditArtistIDs),
                        len(dbArtistURLs), currentModVal))
            self.metadata[currentModVal] = availableArtistIDs
            tsMeta.stop()
        ts.stop()
Ejemplo n.º 18
0
 def createArtistMetadata(self):
     """Build and save the artist ID -> [name, URL] metadata for ``self.modVal``.

     Artists in ``self.dbData`` without a name are left out.  IDs are stored
     as strings.
     """
     ts = timestat("Creating Artist Name Metadata For ModVal={0}".format(self.modVal))

     nameAndURLByID = {}
     for artistID, artistData in self.dbData.items():
         if artistData.artist.name is None:
             continue
         nameAndURLByID[str(artistID)] = [artistData.artist.name, artistData.url.url]
     artistIDMetadata = Series(nameAndURLByID)

     targetFilename = self.disc.getMetadataArtistFilename(self.modVal)
     print("Saving [{0}] {1} Entries To {2}".format(len(artistIDMetadata), "ID => Name/URL", targetFilename))
     self.disc.saveMetadataArtistData(idata=artistIDMetadata, modVal=self.modVal)

     ts.stop()
Ejemplo n.º 19
0
 def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False):
     """Add artists found in the Spotify search data to the ModVal DB.

     Loads (or creates) the DB Series for ``modVal`` and reads the single
     artist search file.  Rows are kept when ``artistModValue().getModVal``
     applied to their ``'sid'`` equals ``modVal``.  Every kept row whose
     artist ID is not yet in the DB is converted with
     ``self.artist.getData`` (with empty albums) and added.  The DB is saved
     only when at least one new entry was added.

     Args:
         modVal: ModVal bucket to process.
         expr: Forwarded to ``getArtistRawFiles``.
         force, debug, quiet: Only echoed in the timer message.

     Raises:
         ValueError: If there is not exactly one artist search file.
     """
     # Local import keeps this method self-contained; pandas is already a
     # dependency of this code through Series.
     from pandas import concat

     ts = timestat("Parsing Spotify Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

     if not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
         tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
         dbdata = Series({}, dtype=object)
     else:
         tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
         dbdata = self.disc.getDBModValData(modVal)
     # Stop the DB timer in both branches (the new-DB branch used to stop
     # the overall timer instead).
     tsDB.stop()

     io = fileIO()
     artistSearchFilename = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
     if len(artistSearchFilename) == 1:
         artistSearchData = io.get(artistSearchFilename[0])
     else:
         raise ValueError("Could not find Spotify API Artist Search Data")

     # Boolean mask of the rows that belong to this ModVal, re-aligned to the
     # original index after reset_index().
     amv = artistModValue()
     idx = artistSearchData.reset_index()['sid'].apply(amv.getModVal) == modVal
     idx.index = artistSearchData.index
     artists = artistSearchData[idx]
     N = artists.shape[0]

     tsParse = timestat("Parsing {0} Searched For Spotify API Artists".format(N))
     # Collect new entries and concatenate once: appending to the Series per
     # row copies it every time, and Series.append no longer exists in
     # pandas 2.x.
     newEntries = {}
     for artistID, artistData in artists.iterrows():
         if dbdata.get(artistID) is not None or newEntries.get(artistID) is not None:
             continue
         artistAPIData = {"Artist": artistData, "Albums": {}}
         newEntries[artistID] = self.artist.getData(artistAPIData)
     tsParse.stop()

     Nnew = len(newEntries)
     if Nnew > 0:
         dbdata = concat([dbdata, Series(newEntries, dtype=object)])
         print("Saving [{0}/{1}] {2} Entries To {3}".format(Nnew, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
         self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
     else:
         print("Not saving any of the new data")

     ts.stop()
Ejemplo n.º 20
0
    def mergeArtistAlbumIDMap(self):
        """Apply the artist merger data to the pre-merge album maps.

        For both ``ArtistIDToAlbumNamesPreMerge.p`` and
        ``ArtistIDToAlbumRefsPreMerge.p`` the map is loaded.  For every merger
        entry the media of all merged DB IDs is combined under the entry's new
        ID, and the merged DB IDs are then removed.  The result is saved
        without the ``PreMerge`` suffix.
        """
        print("="*50)
        print("")
        ts = timestat("Merging ArtistAlbumID DBs for ==> {0} <==".format(self.db))
        print("")
        print("="*50)

        mergerData = self.mam.getMergerDataByDB(self.db)

        # These depend only on the merger data, so compute them once.
        fromIDs = mergerData.apply(lambda x: len(x["MergeData"])).sum()
        toIDs = len(mergerData)

        savenames = ["IDToAlbumNames", "IDToAlbumRefs"]
        for basename in savenames:
            savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}PreMerge.p".format(basename))
            savedata = getFile(savename).to_dict()
            print("Found {0} entries.".format(len(savedata)))

            print("")
            print("================================================")
            print("  Merger From [{0}] DB IDs To [{1}] New IDs".format(fromIDs, toIDs))
            print("    Pre Merge [{0}]".format(len(savedata)))

            # .items() instead of .iteritems(), which pandas 2.x removed.
            for artistName, artistData in mergerData.items():
                newID = artistData["ID"]
                dbIDs = artistData["MergeData"].keys()

                merged = {}
                savedata[newID] = merged
                for artistID in dbIDs:
                    oldMedia = savedata.get(artistID)
                    if oldMedia is None:
                        continue
                    for mediaName, mediaData in oldMedia.items():
                        if merged.get(mediaName) is not None:
                            merged[mediaName].update(mediaData)
                        else:
                            merged[mediaName] = mediaData

                for artistID in dbIDs:
                    try:
                        del savedata[artistID]
                    except KeyError:
                        print("Could not delete merged ID {0}".format(artistID))

            print("   Post Merge [{0}]".format(len(savedata)))
            print("================================================")
            print("")

            savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
            print("Saving {0} entries to {1}\n".format(len(savedata), savename))
            saveFile(ifile=savename, idata=Series(savedata), debug=True)
            sleep(0.5)

        ts.stop()
Ejemplo n.º 21
0
    def getData(self, fast=True, local=False):
        """Load and return the manual renames data.

        ``fast`` and ``local`` are passed to ``self.getFilename`` to pick the
        file to read; they also choose the "Pickle"/"YAML" and "Local"/"Main"
        labels shown in the timer message.
        """
        formatLabels   = {True: "Pickle", False: "YAML"}
        locationLabels = {True: "Local", False: "Main"}
        message = "Getting Manual Renames Data From {0} {1} File".format(
            locationLabels[local], formatLabels[fast])
        timer = timestat(message)

        loaded = self.io.get(self.getFilename(fast, local))
        timer.stop()
        return loaded
Ejemplo n.º 22
0
 def createExtraMetadata(self, modVal=None):
     """Record, per mod value, the artists that have more than one page.

     When ``modVal`` is None every value in ``range(100)`` is processed;
     otherwise only the given one.  For each mod value the name, URL and
     page count of every artist is collected, artists with a single page
     or whose name is in ``self.extraIgnores`` are dropped, and the rest is
     stored in ``self.metadata[modVal]``.
     """
     if modVal is None:
         targets = range(100)
     else:
         targets = [modVal]

     tsAll = timestat("Creating Extra Files Metadata")
     for mv in targets:
         tsLoad = timestat("Finding Pages/URL Data For ModVal={0}".format(mv))
         allArtists = {}
         for artistID, artistData in self.getDBData(mv).items():
             allArtists[artistID] = {
                 "Name": artistData.artist.name,
                 "URL": artistData.url.url,
                 "Pages": self.getNumPages(artistData.pages),
             }
         tsLoad.stop()

         tsMulti = timestat("Finding Artists With More Pages From {0} Artists For ModVal={1}".format(len(allArtists), mv))
         multiPage = {key: info for key, info in allArtists.items() if info["Pages"] > 1}
         tsMulti.stop()

         tsKeep = timestat("Removing Ignored Artists From {0} Artists For ModVal={1}".format(len(multiPage), mv))
         kept = {key: info for key, info in multiPage.items() if info["Name"] not in self.extraIgnores}
         tsKeep.stop()

         tsStore = timestat("Saving Metadata From {0}/{1}/{2} For ModVal={3}".format(len(kept), len(multiPage), len(allArtists), mv))
         self.metadata[mv] = kept
         tsStore.stop()
     tsAll.stop()
Ejemplo n.º 23
0
    def createMasterDBArtistAlbumsDataFrame(self):
        """Build and save the flattened album list and album count per artist.

        Loads the artist-ID -> album-names data, flattens each artist's
        per-media-type album values into one list with ``getFlatList`` and
        saves that as ``MasterArtistIDToAlbums.p``.  The length of each list
        is then saved as ``MasterArtistIDToNumAlbums.p`` under the series
        name ``NumAlbums``.

        The DataFrame-building code that used to follow an unconditional
        ``return`` could never run and has been removed.
        """
        ts = timestat("=================================== Creating Artist Album DB ===================================")

        print("Loading ArtistID Data")
        artistIDtoAlbumNames  = self.disc.getArtistIDToAlbumNamesData()

        print("Creating Flattened List for {0} Artists".format(artistIDtoAlbumNames.shape[0]))
        # Each value maps media type -> album data; only the album data is kept.
        artistIDToAlbumNamesData = artistIDtoAlbumNames.apply(
            lambda val: getFlatList([mediaData.values() for _mediaType, mediaData in val.items()]))
        savename = setFile(self.disc.getDiscogDBDir(), "MasterArtistIDToAlbums.p")
        print("Saving {0}/{1} artists/albums to {2}".format(len(artistIDToAlbumNamesData), artistIDToAlbumNamesData.apply(len).sum(), savename))
        saveFile(ifile=savename, idata=artistIDToAlbumNamesData)

        artistIDToNumAlbumsData = artistIDToAlbumNamesData.apply(len)
        artistIDToNumAlbumsData.name = "NumAlbums"
        savename = setFile(self.disc.getDiscogDBDir(), "MasterArtistIDToNumAlbums.p")
        print("Saving {0}/{1} artists/albums to {2}".format(len(artistIDToNumAlbumsData), artistIDToNumAlbumsData.sum(), savename))
        saveFile(ifile=savename, idata=artistIDToNumAlbumsData)

        ts.stop()
Ejemplo n.º 24
0
    def downloadUnknownArtistCredits(self):
        """Search for credit pages of artists that have no saved credit file.

        Walks ``self.metadata`` per mod value.  Artists whose credit savename
        already exists are skipped.  For the rest, the artist name is
        recovered from the stored page title by removing the fixed site text
        and one surrounding double quote on each side, and a credit search is
        run for it.  IDs whose search reports zero downloads are collected
        and handed to ``self.updateMasterIgnoreCreditData``.
        """
        # Fixed text removed from the title, in this order.
        boilerplate = ("Artist Search for ",
                       " | AllMusic",
                       "Songs, Albums, Reviews, Bio & More")

        noCreditIDs = []
        for modVal, modValMetadata in self.metadata.items():
            total = len(modValMetadata)
            timer = timestat("Downloading {0} Unknown Credit Files For ModVal={1}".format(total, modVal))
            for pos, (artistID, artistIDData) in enumerate(modValMetadata.items()):
                if isFile(self.dutils.getArtistSavename(artistID, credit=True)):
                    continue

                artist = artistIDData["title"]
                for fragment in boilerplate:
                    artist = artist.replace(fragment, "")
                artist = artist.strip()
                if artist.startswith('"'):
                    artist = artist[1:]
                if artist.endswith('"'):
                    artist = artist[:-1]

                print("{0}/{1}:  [{2}]".format(pos, total, artist))
                if not artist:
                    # Nothing left to search for.
                    continue

                found = self.dbArtists.searchForArtistCredit(artist=artist, artistID=artistID)
                if found == 0:
                    noCreditIDs.append(artistID)
            timer.stop()

        print("New IDs To Ignore")
        print(noCreditIDs)
        updateTimer = timestat("Adding {0} ArtistIDs To Master Credit Ignore List".format(len(noCreditIDs)))
        self.updateMasterIgnoreCreditData(noCreditIDs)
        updateTimer.stop()
Ejemplo n.º 25
0
 def parse(self, expr, force=False, debug=False, quiet=False):
     """Parse raw artist HTML files and save the parsed results.

     The files come from ``self.getArtistRawHTMLFiles(expr, force=force)``.
     Each one is parsed with ``self.artist.getData`` and saved under the
     savename for its artist ID when that savename is a string and either
     ``force`` is set or no file exists there yet.  ``debug`` and ``quiet``
     only appear in the timer message.
     """
     tsAll = timestat("Parsing Raw HTML Files(expr=\'{0}\', force={1}, debug={2}, quiet={3})".format(expr, force, debug, quiet))

     io = fileIO()
     rawFiles = self.getArtistRawHTMLFiles(expr, force=force)
     N = len(rawFiles)

     # Progress is reported less often for larger batches.
     if N >= 2000:
         every = 500
     elif N >= 500:
         every = 250
     else:
         every = 50

     saved = 0
     tsParse = timestat("Parsing {0} Raw HTML Files".format(N))
     for done, ifile in enumerate(rawFiles, start=1):
         if done % every == 0 or done == N:
             tsParse.update(n=done, N=N)

         # NOTE(review): the loaded content is never used; the parser is
         # given the file name instead.  Kept because the read may matter.
         _ = io.get(ifile)
         parsed = self.artist.getData(ifile)
         savename = self.dutils.getArtistSavename(parsed.ID.ID)
         if not isinstance(savename, str):
             continue
         # Equality comparisons are kept as-is so non-bool values behave
         # exactly as before.
         if force == True or fileUtil(savename).exists == False:
             io.save(idata=parsed, ifile=savename)
             saved += 1

     # NOTE(review): tsParse is never stopped, only the outer timer is.
     tsAll.stop()
     print("Saved {0} New Files".format(saved))
Ejemplo n.º 26
0
def poolParse(dbObj, modVals, expr="< 0 Days", force=False, numProcs=2):
    """Run ``parseDB`` over ``modVals`` through ``tqdmMap``.

    Every call gets the same ``expr``, ``force`` and ``debug=False`` keyword
    arguments, and ``dbObj`` under the keyword ``class``.  ``numProcs`` is
    passed on as the number of processes.  Nothing is returned.
    """
    # "class" is a reserved word, so it can only be passed by unpacking.
    sharedKwargs = {"expr": expr, "class": dbObj, "force": force, "debug": False}
    worker = partial(parseDB, **sharedKwargs)

    timer = timestat("Running imap multiprocessing for {0} mod values ...".format(len(modVals)))
    tqdmMap(func=worker, argument_list=modVals, num_processes=numProcs)
    timer.stop()
Ejemplo n.º 27
0
 def createArtistDataForMatch(self):
     """Save the slimmed search data for artists with at least two albums.

     Three datasets are loaded from ``self.disc`` (ID -> album count,
     ID -> name, ID -> album names).  Each is restricted to the IDs whose
     album count is >= 2, stripped of the IDs returned by
     ``self.miid.getIgnoreDBIDs``, and saved through the matching
     ``self.disc.save...`` method.  Names are passed through
     ``self.transDB.renamed`` before saving.
     """
     print("="*125)
     timer = timestat("Creating Slimmed >=2 Albums + Translation DBs for ==> {0} <==".format(self.db))
     print("="*125)
     ignoreIDs = self.miid.getIgnoreDBIDs(self.db)
     print("  Removing These IDs: {0}".format(ignoreIDs))

     def dropIgnored(series):
         # Keep only the rows whose index is not an ignored ID.
         return series[~series.index.isin(ignoreIDs)]

     # --- Artist ID => Num Albums (w/ >=2 Albums) ---
     numAlbums   = self.disc.getArtistIDToNumAlbumsData()
     hasTwoPlus  = numAlbums >= 2
     searchCount = dropIgnored(numAlbums[hasTwoPlus])
     print("Saving [{0}/{1}] {2} Entries To {3}".format(
         len(searchCount), len(numAlbums),
         "ID => NumAlbums", self.disc.getArtistIDToSearchNumAlbumsFilename()))
     self.disc.saveArtistIDToSearchNumAlbumsData(idata=searchCount)

     # --- Artist ID => Name (w/ >=2 Albums) ---
     names       = self.disc.getArtistIDToNameData()
     searchNames = dropIgnored(names[hasTwoPlus])
     transNames  = searchNames.apply(self.transDB.renamed)
     numTranslated = (transNames != searchNames).sum()
     print("Saving [{0}/{1}] (NumTrans={2}) {3} Entries To {4}".format(
         len(transNames), len(names),
         numTranslated, "ID => Name",
         self.disc.getArtistIDToSearchNameFilename()))
     self.disc.saveArtistIDToSearchNameData(idata=transNames)

     # --- Artist ID => Albums (w/ >=2 Albums) ---
     albumNames   = self.disc.getArtistIDToAlbumNamesData()
     searchAlbums = dropIgnored(albumNames[hasTwoPlus])
     print("Saving [{0}/{1}] {2} Entries To {3}".format(
         len(searchAlbums), len(albumNames),
         "ID => AlbumNames", self.disc.getArtistIDToSearchAlbumNamesFilename()))
     self.disc.saveArtistIDToSearchAlbumNamesData(idata=searchAlbums)

     timer.stop()
Ejemplo n.º 28
0
    def saveData(self, manualMultiArtists, fast=True, local=False):
        """Save ``manualMultiArtists`` to the file chosen by ``fast``/``local``.

        With ``fast`` set, a list is wrapped in a ``Series`` and the data is
        sorted by value before saving.  Otherwise a ``Series`` is converted
        to a list and anything else is saved unchanged.
        """
        formatLabels   = {True: "Pickle", False: "YAML"}
        locationLabels = {True: "Local", False: "Main"}
        timer = timestat("Saving Manual Renames Data To {0} {1} File".format(
            locationLabels[local], formatLabels[fast]))

        fname = self.getFilename(fast, local)
        payload = manualMultiArtists
        if fast:
            if isinstance(payload, list):
                payload = Series(payload)
            payload = payload.sort_values()
        elif isinstance(payload, Series):
            payload = payload.to_list()
        self.io.save(idata=payload, ifile=fname)

        timer.stop()
Ejemplo n.º 29
0
 def loadAlbums(self, idxReq=None):
     """Load the artist -> albums data of every DB into ``self.mdbData``.

     For each DB in ``self.discs`` the albums data is stored under
     ``"IDToAlbums"``.  It is restricted to the IDs present in the DB's
     ``"IDToName"`` data when that is loaded, and then to the IDs returned
     by ``self.getIdxReqs`` when that is not None.  The series is named
     ``"Albums"``.

     Args:
         idxReq: requirements forwarded to ``self.getIdxReqs``; None (the
             default) is treated as an empty dict.
     """
     # A literal {} default would be one dict shared by every call.
     idxReq = {} if idxReq is None else idxReq

     ts = timestat("Loading DB Albums Data For {0} DBs".format(
         len(self.discs)))
     for db, _disc in self.discs.items():
         idToName = self.mdbData[db].get("IDToName")
         # NOTE(review): only the index of this mask is used below, so IDs
         # with a NaN name are kept as well - confirm that is intended.
         nameIdxs = idToName.notna() if idToName is not None else None

         self.mdbData[db]["IDToAlbums"] = self.getArtistAlbumsData(db)
         if nameIdxs is not None:
             albums = self.mdbData[db]["IDToAlbums"]
             self.mdbData[db]["IDToAlbums"] = albums[albums.index.isin(nameIdxs.index)]

         reqIdxs = self.getIdxReqs(db, idxReq=idxReq)
         if reqIdxs is not None:
             albums = self.mdbData[db]["IDToAlbums"]
             self.mdbData[db]["IDToAlbums"] = albums[albums.index.isin(reqIdxs.index)]

         self.mdbData[db]["IDToAlbums"].name = "Albums"
     ts.stop()
Ejemplo n.º 30
0
    def loadArtists(self, numAlbumsReq=None):
        """Load artist names and album counts of every DB into ``self.mdbData``.

        For each DB in ``self.discs`` the name data is stored under
        ``"IDToName"`` (series name ``"ArtistName"``) and the album counts
        under ``"IDToNumAlbums"`` (series name ``"NumAlbums"``).  Both are
        restricted with ``.loc`` to the result of ``self.getIdxReqs`` when
        that is not None, and left unfiltered otherwise.

        Args:
            numAlbumsReq: requirements forwarded to ``self.getIdxReqs``;
                None (the default) is treated as an empty dict.
        """
        # A literal {} default would be one dict shared by every call.
        numAlbumsReq = {} if numAlbumsReq is None else numAlbumsReq

        ts = timestat("Loading DB Artist Data For {0} DBs".format(
            len(self.discs)))
        for db, _disc in self.discs.items():
            idxs = self.getIdxReqs(db, numAlbumsReq=numAlbumsReq)

            names = self.getArtistNameData(db)
            if idxs is not None:
                names = names.loc[idxs]
            self.mdbData[db]["IDToName"] = names
            self.mdbData[db]["IDToName"].name = "ArtistName"

            # The counts used to be sliced with .loc[idxs] before the None
            # check, which fails when there are no requirements.
            numAlbums = self.getNumAlbumsData(db)
            if idxs is not None:
                numAlbums = numAlbums.loc[idxs]
            self.mdbData[db]["IDToNumAlbums"] = numAlbums
            self.mdbData[db]["IDToNumAlbums"].name = "NumAlbums"
        ts.stop()