def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False):
    """Add searched-for Discogs artists that are missing from the ModVal DB.

    Loads the existing DB for ``modVal`` (or starts an empty one), loads the
    artist search data file whose base name is ``artistData-<modVal>``, and
    builds an entry with ``self.artist.getData`` for every searched artist
    that has no entry yet. Albums are taken from the previously saved media
    metadata when present, otherwise an empty dict is used. The DB is saved
    only when at least one new entry was created.

    Args:
        modVal: ModVal bucket to process.
        expr: Passed through to ``self.getArtistRawFiles``.
        force: Only echoed in the timer message; not otherwise used here.
        debug: Only echoed in the timer message; not otherwise used here.
        quiet: Only echoed in the timer message; not otherwise used here.

    Raises:
        ValueError: If there is not exactly one matching search data file.
    """
    # Function-scope import: Series.append was removed in pandas 2.0, so the
    # merge below uses concat instead.
    from pandas import concat

    ts = timestat(
        "Parsing Discogs Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(
            modVal, expr, force, debug, quiet))
    io = fileIO()

    ####################################################################
    # Previous DB Data
    ####################################################################
    dbFilename = self.disc.getDBModValFilename(modVal)
    if not fileUtil(dbFilename).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = Series({})
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer on both paths; the overall timer keeps running.
    tsDB.stop()

    ####################################################################
    # Previous Media Data
    ####################################################################
    previousMetadata = self.disc.getMetadataAlbumData(modVal)

    ####################################################################
    # Artist Search Data (No Media)
    ####################################################################
    tsDB = timestat("Loading Artist Search Data For ModVal={0}".format(modVal))
    artistSearchFilenames = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
    wantedBasename = "artistData-{0}".format(modVal)
    artistSearchFilename = [x for x in artistSearchFilenames
                            if fileUtil(x).basename == wantedBasename]
    if len(artistSearchFilename) != 1:
        raise ValueError("Could not find Discogs API Artist Search Data")
    artistSearchData = io.get(artistSearchFilename[0])
    tsDB.stop()

    N = artistSearchData.shape[0]
    modValue = 5000 if N >= 50000 else 1000

    tsParse = timestat("Parsing {0} Searched For Discogs API Artists".format(N))
    # Collect new entries in a dict and merge once after the loop instead of
    # rebuilding the Series for every artist.
    newEntries = {}
    for i, (artistID, artistData) in enumerate(artistSearchData.iterrows()):
        if (i+1) % modValue == 0 or (i+1) == N:
            tsParse.update(n=i+1, N=N)
        if artistID in newEntries or dbdata.get(artistID) is not None:
            continue
        artistAPIData = {"Artist": artistData,
                         "Albums": previousMetadata.get(artistID, {})}
        newEntries[artistID] = self.artist.getData(artistAPIData)

    Nnew = len(newEntries)
    if Nnew > 0:
        fresh = Series(newEntries)
        dbdata = fresh if len(dbdata) == 0 else concat([dbdata, fresh])
        print("Saving [{0}/{1}] {2} Entries To {3}".format(Nnew, len(dbdata), "ID Data", dbFilename))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
    else:
        print("Not saving any of the new data")

    ts.stop()
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse raw pickled HTML artist files and save the parsed results.

    Every file returned by ``self.getArtistRawFiles(datatype="data", ...)``
    is passed to ``self.artist.getData``. Files that yield no data or no
    artist ID are skipped (reported only when ``debug`` is set). The result
    is saved under ``self.dutils.getArtistSavename(artistID)`` when that
    name is a string and either ``force`` is set or the target does not
    exist yet.

    Args:
        expr: Passed through to ``self.getArtistRawFiles``.
        force: Passed to the file lookup; also allows overwriting saves.
        debug: Print skipped files and update the progress timer per file.
        quiet: Only echoed in the timer message.
    """
    ts = timestat(
        "Parsing Raw Pickled HTML Files(expr=\'{0}\', force={1}, debug={2}, quiet={3})".format(
            expr, force, debug, quiet))

    io = fileIO()
    newFiles = self.getArtistRawFiles(datatype="data", expr=expr, force=force)
    N = len(newFiles)
    if N >= 500:
        modValue = 250
    else:
        modValue = 50

    nSave = 0
    tsParse = timestat("Parsing {0} Raw Picked HTML Files".format(N))
    for count, ifile in enumerate(newFiles, start=1):
        # Progress: every modValue files, on the last file, or always in debug.
        if count % modValue == 0 or count == N or debug:
            tsParse.update(n=count, N=N)

        retval = self.artist.getData(ifile)
        if retval is None:
            if debug:
                print("Could not find data for {0}".format(ifile))
            continue

        artistID = retval.ID.ID
        if artistID is None:
            if debug:
                print("Could not find artistID for {0}".format(ifile))
            continue

        savename = self.dutils.getArtistSavename(artistID)
        if not isinstance(savename, str):
            continue
        if force == True or fileUtil(savename).exists == False:
            io.save(idata=retval, ifile=savename)
            nSave += 1

    ts.stop()
    print("Saved {0} New Files".format(nSave))
def parse(self, modVal, expr, force=False, debug=False, quiet=False):
    """Parse primary artist files for one ModVal and merge them into its DB.

    Files come from ``self.getArtistPrimaryFiles(modVal, expr, force)``.
    Each file is parsed with ``self.artist.getData``; the result is stored
    under the file's base name when that matches the parsed artist ID.
    Mismatching files are skipped, or abort the run when ``debug`` is True.
    The DB is saved only when at least one entry was stored.

    Args:
        modVal: ModVal bucket to process.
        expr: Passed through to ``self.getArtistPrimaryFiles``.
        force: Passed to the file lookup; when True the DB is rebuilt from
            scratch instead of loading the existing one.
        debug: Print per-file details and raise on an ID mismatch.
        quiet: Only echoed in the timer message.

    Returns:
        ``None`` when there are no files to parse; otherwise ``True`` if any
        entry was stored (and the DB saved), else ``False``.

    Raises:
        ValueError: In debug mode, when a parsed artist ID does not match
            the file's base name.
    """
    ts = timestat(
        "Parsing Primary ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(
            modVal, expr, force, debug, quiet))

    tsFiles = timestat("Finding Files To Parse")
    newFiles = self.getArtistPrimaryFiles(modVal, expr, force)
    tsFiles.stop()

    N = len(newFiles)
    if N == 0:
        ts.stop()
        return

    # Report progress roughly every 10% of the files, in steps of 250.
    modValue = max([250 * round((N/10)/250), 250])

    dbFilename = self.disc.getDBModValFilename(modVal)
    if force is True or not fileUtil(dbFilename).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = {}
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer on both paths; the overall timer keeps running.
    tsDB.stop()

    newData = 0
    tsParse = timestat("Parsing {0} New Files For ModVal={1}".format(N, modVal))
    for i, ifile in enumerate(newFiles):
        if (i+1) % modValue == 0 or (i+1) == N:
            tsParse.update(n=i+1, N=N)

        artistID = getBaseFilename(ifile)
        info = self.artist.getData(ifile)
        if debug:
            print("\t", ifile, ' ==> ', info.ID.ID, ' <-> ', artistID)

        if info.ID.ID != artistID:
            if debug is True:
                print("Error for {0} ID={1} FileID={2}".format(info.meta.title, info.ID.ID, artistID))
                # Explicit abort in debug mode (previously a bare ``1/0``).
                raise ValueError("Artist ID mismatch for {0}: parsed ID={1}, file ID={2}".format(
                    ifile, info.ID.ID, artistID))
            continue

        dbdata[artistID] = info
        newData += 1
    tsParse.stop()

    if newData > 0:
        dbdata = Series(dbdata)
        print("Saving [{0}/{1}] {2} Entries To {3}".format(newData, len(dbdata), "ID Data", dbFilename))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)

    ts.stop()
    return newData > 0
def parse(self, modVal, expr, force=False, debug=False, quiet=False):
    """Parse raw pickled Spotify API files for one ModVal into its DB.

    Each file from ``self.getArtistPrimaryFiles`` is loaded and its
    ``'artistID'`` looked up in the artist search data. Files whose ID is
    not in the search data are reported and skipped. For the rest an entry
    is built with ``self.artist.getData`` from the search row ("Artist")
    and the file contents ("Albums"). Re-parsed artists replace their
    existing DB entry. The DB is saved only when something was parsed.

    Args:
        modVal: ModVal bucket to process.
        expr: Passed through to the file lookups.
        force: Passed to ``getArtistPrimaryFiles``; when True the DB is
            rebuilt from scratch instead of loading the existing one.
        debug: Only echoed in the timer message.
        quiet: Only echoed in the timer message.

    Raises:
        ValueError: If there is not exactly one artist search data file.
    """
    # Function-scope import: Series.append was removed in pandas 2.0, so the
    # merge below uses concat instead.
    from pandas import concat

    ts = timestat(
        "Parsing Raw Pickled Spotify API Primary ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(
            modVal, expr, force, debug, quiet))
    io = fileIO()

    newFiles = self.getArtistPrimaryFiles(modVal, expr, force)
    print("Found {0} New Files".format(len(newFiles)))
    if len(newFiles) == 0:
        # Stop the overall timer before leaving early.
        ts.stop()
        return

    artistSearchFilename = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
    if len(artistSearchFilename) != 1:
        raise ValueError("Could not find Spotify API Artist Search Data")
    artistSearchData = io.get(artistSearchFilename[0])

    dbFilename = self.disc.getDBModValFilename(modVal)
    if force is True or not fileUtil(dbFilename).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = Series({})
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer on both paths; the overall timer keeps running.
    tsDB.stop()

    N = len(newFiles)
    modValue = 500 if N >= 5000 else 100

    tsParse = timestat("Parsing {0} Raw Picked API Files".format(N))
    # Collect entries in a dict and merge once after the loop instead of
    # rebuilding the Series for every file.
    newEntries = {}
    for i, ifile in enumerate(newFiles):
        if (i+1) % modValue == 0 or (i+1) == N:
            tsParse.update(n=i+1, N=N)

        dData = io.get(ifile)
        artistID = dData['artistID']
        try:
            artistData = artistSearchData.loc[artistID]
        except (KeyError, TypeError):
            print("Could not find Spotify ID [{0}]".format(artistID))
            continue

        artistAPIData = {"Artist": artistData, "Albums": dData}
        newEntries[artistID] = self.artist.getData(artistAPIData)

    nSave = len(newEntries)
    if nSave > 0:
        fresh = Series(newEntries)
        # Drop stale entries for re-parsed artists so labels stay unique.
        kept = dbdata[~dbdata.index.isin(fresh.index)]
        dbdata = fresh if len(kept) == 0 else concat([kept, fresh])
        print("Saving [{0}/{1}] {2} Entries To {3}".format(nSave, len(dbdata), "ID Data", dbFilename))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
    else:
        print("Not saving any of the new data")

    ts.stop()
def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False):
    """Add searched-for Spotify artists that are missing from the ModVal DB.

    Loads the existing DB for ``modVal`` (or starts an empty one), loads the
    single artist search data file, keeps the rows whose ``'sid'`` maps to
    ``modVal`` through ``artistModValue().getModVal``, and builds an entry
    with ``self.artist.getData`` (with empty albums) for every such artist
    that has no entry yet. The DB is saved only when a new entry was created.

    Args:
        modVal: ModVal bucket to process.
        expr: Passed through to ``self.getArtistRawFiles``.
        force: Only echoed in the timer message; not otherwise used here.
        debug: Only echoed in the timer message; not otherwise used here.
        quiet: Only echoed in the timer message; not otherwise used here.

    Raises:
        ValueError: If there is not exactly one artist search data file.
    """
    # Function-scope import: Series.append was removed in pandas 2.0, so the
    # merge below uses concat instead.
    from pandas import concat

    ts = timestat(
        "Parsing Spotify Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(
            modVal, expr, force, debug, quiet))

    dbFilename = self.disc.getDBModValFilename(modVal)
    if not fileUtil(dbFilename).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = Series({})
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer on both paths; the overall timer keeps running.
    tsDB.stop()

    io = fileIO()
    artistSearchFilename = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
    if len(artistSearchFilename) != 1:
        raise ValueError("Could not find Spotify API Artist Search Data")
    artistSearchData = io.get(artistSearchFilename[0])

    # Boolean mask of the rows belonging to this ModVal, re-labelled with
    # the original index so it can be used to select from the search data.
    amv = artistModValue()
    idx = artistSearchData.reset_index()['sid'].apply(amv.getModVal) == modVal
    idx.index = artistSearchData.index
    artists = artistSearchData[idx]

    N = artists.shape[0]
    tsParse = timestat("Parsing {0} Searched For Spotify API Artists".format(N))
    # Collect new entries in a dict and merge once after the loop instead of
    # rebuilding the Series for every artist.
    newEntries = {}
    for artistID, artistData in artists.iterrows():
        if artistID in newEntries or dbdata.get(artistID) is not None:
            continue
        artistAPIData = {"Artist": artistData, "Albums": {}}
        newEntries[artistID] = self.artist.getData(artistAPIData)

    Nnew = len(newEntries)
    if Nnew > 0:
        fresh = Series(newEntries)
        dbdata = fresh if len(dbdata) == 0 else concat([dbdata, fresh])
        print("Saving [{0}/{1}] {2} Entries To {3}".format(Nnew, len(dbdata), "ID Data", dbFilename))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
    else:
        print("Not saving any of the new data")

    ts.stop()
def getDataBase(self, inputdata):
    """Store artist input on the instance, loading it from a file if needed.

    A dict is stored as ``self.bsdata`` unchanged. Anything else is treated
    as a file path (presumably a str) and loaded with ``fileIO().get``; the
    loaded object decides where it goes:

    * ``artistDBDataClass`` -> ``self.dbdata``
    * ``str`` / ``bytes``   -> ``self.bsdata = getHTML(...)``
    * BS4 object or dict    -> ``self.bsdata``

    Args:
        inputdata: A dict of data, or a path to an existing file.

    Raises:
        ValueError: If ``inputdata`` is neither a dict nor an existing file,
            if the loaded file has an unsupported type, or if ``getHTML``
            fails on the loaded text (the original error is chained).
    """
    if isinstance(inputdata, dict):
        self.bsdata = inputdata
        return

    if not fileUtil(inputdata).isFile():
        raise ValueError("Not sure about input type: {0}".format(type(inputdata)))

    ioData = fileIO().get(inputdata)
    if isinstance(ioData, artistDBDataClass):
        self.dbdata = ioData
    elif isinstance(ioData, (str, bytes)):
        label = "str" if isinstance(ioData, str) else "bytes"
        try:
            self.bsdata = getHTML(ioData)
        except Exception as err:
            # Narrower than a bare except, and keeps the underlying cause.
            raise ValueError("Cannot read artist [{0}] file: {1}".format(label, inputdata)) from err
    elif isBS4(ioData) or isinstance(ioData, dict):
        self.bsdata = ioData
    else:
        raise ValueError("Not sure about file data type: {0}".format(type(ioData)))
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse raw HTML artist files and save the parsed results.

    Every file returned by ``self.getArtistRawHTMLFiles(expr, force=force)``
    is passed to ``self.artist.getData``. Files that yield no data or no
    artist ID are skipped (reported only when ``debug`` is set). The result
    is saved under ``self.dutils.getArtistSavename(artistID)`` when that
    name is a string and either ``force`` is set or the target does not
    exist yet.

    Args:
        expr: Passed through to ``self.getArtistRawHTMLFiles``.
        force: Passed to the file lookup; also allows overwriting saves.
        debug: Print files that were skipped.
        quiet: Only echoed in the timer message.
    """
    ts = timestat(
        "Parsing Raw HTML Files(expr=\'{0}\', force={1}, debug={2}, quiet={3})".format(
            expr, force, debug, quiet))

    io = fileIO()
    newFiles = self.getArtistRawHTMLFiles(expr, force=force)
    N = len(newFiles)
    if N >= 2000:
        modValue = 500
    elif N >= 500:
        modValue = 250
    else:
        modValue = 50

    nSave = 0
    tsParse = timestat("Parsing {0} Raw HTML Files".format(N))
    for i, ifile in enumerate(newFiles):
        if (i+1) % modValue == 0 or (i+1) == N:
            tsParse.update(n=i+1, N=N)

        # getData is given the filename, so the file is not pre-loaded here.
        retval = self.artist.getData(ifile)
        if retval is None:
            if debug:
                print("Could not find data for {0}".format(ifile))
            continue

        artistID = retval.ID.ID
        if artistID is None:
            if debug:
                print("Could not find artistID for {0}".format(ifile))
            continue

        savename = self.dutils.getArtistSavename(artistID)
        if isinstance(savename, str) and (force == True or fileUtil(savename).exists == False):
            io.save(idata=retval, ifile=savename)
            nSave += 1

    ts.stop()
    print("Saved {0} New Files".format(nSave))
def __init__(self):
    """Set up the ignore-data locations and a file I/O helper.

    ``ignoreDirname`` is ``dirUtil(prefix).join("dbdata")`` and
    ``ignoreFilename`` is ``dbIgnoreData.yaml`` joined onto that directory
    (presumably path joins; see ``dirUtil`` / ``fileUtil``).
    """
    ignoreDir = dirUtil(prefix).join("dbdata")
    self.ignoreDirname = ignoreDir
    self.ignoreFilename = fileUtil(ignoreDir).join("dbIgnoreData.yaml")
    self.io = fileIO()
def parse(self, modVal, expr, force=False, debug=False, quiet=False):
    """Parse raw pickled Discogs API files for one ModVal into its DB.

    Each file from ``self.getArtistPrimaryFiles`` is loaded and its base
    name is used as the artist ID to look up the artist search data. Files
    whose ID is not in the search data are reported and skipped. For the
    rest an entry is built with ``self.artist.getData`` from the search row
    ("Artist") and the file contents ("Albums"). Re-parsed artists replace
    their existing DB entry. The DB is saved only when something was parsed.

    Args:
        modVal: ModVal bucket to process.
        expr: Passed through to the file lookups.
        force: When True the DB is rebuilt from scratch instead of loading
            the existing one.
        debug: Only echoed in the timer message.
        quiet: Only echoed in the timer message.

    Raises:
        ValueError: If there is not exactly one search data file whose base
            name is ``artistData-<modVal>``.
    """
    # Function-scope import: Series.append was removed in pandas 2.0, so the
    # merge below uses concat instead.
    from pandas import concat

    ts = timestat(
        "Parsing Raw Pickled Discogs API Primary ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(
            modVal, expr, force, debug, quiet))
    io = fileIO()

    ####################################################################
    # New Files
    ####################################################################
    tsDB = timestat("Finding New Files For ModVal={0}".format(modVal))
    # NOTE(review): the lookup always uses force=True, ignoring the caller's
    # ``force`` -- confirm this is intentional.
    newFiles = self.getArtistPrimaryFiles(modVal, expr, force=True)
    print("Found {0} New Files".format(len(newFiles)))
    tsDB.stop()
    if len(newFiles) == 0:
        # Stop the overall timer before leaving early.
        ts.stop()
        return

    ####################################################################
    # Artist Search Data (No Media)
    ####################################################################
    tsDB = timestat("Loading Artist Search Data For ModVal={0}".format(modVal))
    artistSearchFilenames = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
    wantedBasename = "artistData-{0}".format(modVal)
    artistSearchFilename = [x for x in artistSearchFilenames
                            if fileUtil(x).basename == wantedBasename]
    if len(artistSearchFilename) != 1:
        raise ValueError("Could not find Discogs API Artist Search Data")
    artistSearchData = io.get(artistSearchFilename[0])
    tsDB.stop()

    ####################################################################
    # Previous Media Data
    ####################################################################
    tsDB = timestat("Loading Media Metadata For ModVal={0}".format(modVal))
    # Loaded as in the original flow; not consulted further in this method.
    previousMetadata = self.disc.getMetadataAlbumData(modVal)
    tsDB.stop()

    ####################################################################
    # Previous DB Data
    ####################################################################
    dbFilename = self.disc.getDBModValFilename(modVal)
    if force is True or not fileUtil(dbFilename).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = Series({})
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer on both paths; the overall timer keeps running.
    tsDB.stop()

    N = len(newFiles)
    modValue = 2500 if N >= 5000 else 250

    tsParse = timestat("Parsing {0} Raw Picked API Files".format(N))
    # Collect entries in a dict and merge once after the loop. The parsed
    # object itself is stored, never a Series wrapped around it.
    newEntries = {}
    for i, ifile in enumerate(newFiles):
        if (i+1) % modValue == 0 or (i+1) == N:
            tsParse.update(n=i+1, N=N)

        dData = io.get(ifile)
        artistID = fileUtil(ifile).basename
        try:
            artistData = artistSearchData.loc[artistID]
        except (KeyError, TypeError):
            print("Could not find Discogs ID [{0}]".format(artistID))
            continue

        artistAPIData = {"Artist": artistData, "Albums": dData}
        newEntries[artistID] = self.artist.getData(artistAPIData)

    nSave = len(newEntries)
    if nSave > 0:
        fresh = Series(newEntries)
        # Drop stale entries for re-parsed artists so labels stay unique.
        kept = dbdata[~dbdata.index.isin(fresh.index)]
        dbdata = fresh if len(kept) == 0 else concat([kept, fresh])
        print("Saving [{0}/{1}] {2} Entries To {3}".format(nSave, len(dbdata), "ID Data", dbFilename))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
    else:
        print("Not saving any of the new data")

    ts.stop()