Code Example #1
File: dbArtistsBase.old.py  Project: tgadf/dbdata
    def rmIDFromDB(self, artistID, modValue=None):
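        ## Load the mod-value DB file, delete the given artist ID(s) from it, remove their raw files, and re-save if anything changed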
        print("Trying to remove data from ArtistID {0}".format(artistID))
        if modValue is None:
            modValue  = self.dutils.getDiscIDHashMod(discID=artistID, modval=self.disc.getMaxModVal())
        artistDBDir = self.disc.getArtistsDBDir()
        dbname  = setFile(artistDBDir, "{0}-DB.p".format(modValue))     
        print("Loading {0}".format(dbname))
        dbdata  = getFile(dbname)
        
        saveVal = False

        if isinstance(artistID, str):
            artistID = [artistID]
        elif not isinstance(artistID, list):
            raise ValueError("Not sure what to do with {0}".format(artistID))
            
        for ID in artistID:
            try:
                del dbdata[ID]
                print("Deleted {0}".format(ID))
                saveVal = True
            except KeyError:
                print("{0} is not in the database...".format(ID))

            self.rmIDFiles(ID)

        if saveVal:
            print("Saving {0}".format(dbname))
            saveFile(idata=dbdata, ifile=dbname)
        else:
            print("No reason to save {0}".format(dbname))
Code Example #2
    def saveCorrections(self, debug=True):
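        ## Merge new corrections from corr.yaml into the cumulative saved.yaml and write it back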
        corrsavename = setFile(self.getDataDir(), "corr.yaml")
        corrData = getFile(corrsavename)        

        try:
            savename = setFile(self.getDataDir(), "saved.yaml")
            savedData = getFile(savename)
        except Exception:
            print("Could not access saved data. Starting with an empty saved set.")
            savedData = {}

        if corrData is None:
            print("There is no corrections data.")
        else:
            print("Found {0} old corrections".format(len(savedData)))
            print("Found {0} new corrections".format(len(corrData)))
            for movie,corrs in corrData.items():
                if savedData.get(movie) is None:
                    if debug:
                        print("Adding {0}".format(movie))
                    savedData[movie] = corrs
                else:
                    newSaved = list(set(savedData[movie] + corrs))
                    if len(newSaved) != len(savedData[movie]):
                        print("Adding new corrections to {0}".format(movie))
                    savedData[movie] = newSaved

            try:
                savename = setFile(self.getDataDir(), "saved.yaml")
                saveFile(idata=savedData, ifile=savename, debug=debug)        
                print("There are {0} total corrections".format(len(savedData)))
            except Exception:
                raise ValueError("There was an error saving the saved corrections yaml file!")
Code Example #3
File: wikifilm.py  Project: tgadf/movies
    def processWikiFilmYearlyData(self, procYear=None, debug=False):
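        ## Parse the yearly WikiFilm pickle files, rank each year's movies, and save the combined results as JSON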
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        from collections import OrderedDict
        movies = OrderedDict()
        yearlyData = {}

        for ifile in sorted(files):
            if debug:
                print("Processing {0}".format(ifile))
            year = getBaseFilename(ifile)
            movies[year] = self.parseWikiFilmYearlyData(ifile, debug=False)

            yearlyData[year] = sorted(movies[year].items(),
                                      key=operator.itemgetter(1),
                                      reverse=False)
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} WikiFilm data to {1}".format(len(yearlyData),
                                                       savename))
        saveFile(savename, yearlyData)
Code Example #4
File: dbArtistsParse.py  Project: tgadf/dbdata
    def parse(self, expr, force=False, debug=False, quiet=False):
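        ## Parse raw HTML files matching expr and re-save each one under its parsed artist ID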
        ts = timestat("Parsing Raw HTML Files")
        
        tsFiles  = timestat("Finding Files To Parse")
        newFiles = self.getArtistRawHTMLFiles(expr, force)
        tsFiles.stop()
        if debug:
            print("Parsing {0} Raw HTML Files From Expr[{1}]".format(len(newFiles), expr))

        N = len(newFiles)
        modValue = 250 if N >= 500 else 50
        tsParse = timestat("Parsing {0} Raw HTML Files".format(N))
        for i,ifile in enumerate(newFiles):
            if (i+1) % modValue == 0 or (i+1) == N or debug:
                tsParse.update(n=i+1, N=N)
                #print("{0: <15}Parsing {1}".format("{0}/{1}".format(i+1,N), ifile))
            
            if debug:
                print("{0}/{1}\tParsing {2}".format(i+1, N, ifile))
            htmldata = getFile(ifile)
            retval   = self.artist.getData(ifile)
            artistID = retval.ID.ID
            if debug:
                print("  ---> ID={0}".format(artistID))
            savename = self.dutils.getArtistSavename(artistID)
            saveFile(idata=htmldata, ifile=savename, debug=False)        
        
        tsParse.stop()
        ts.stop()
Code Example #5
File: oscar.py  Project: tgadf/movies
    def createRawOscarData(self, debug=True):
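        ## Rebuild the raw oscar data from the yearly JSON files, applying manual winner/nominee corrections from saved.yaml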
        print("Checking for poorly parsed oscar data.")
        indir = self.wikiData.getResultsDir()
        files = sorted(findExt(indir, ext=".json"))
        if debug:
            print("Found {0} oscar files".format(len(files)))
        yearlyData = {}
        for ifile in files:
            year = getBaseFilename(ifile)
            yearlyData[year] = getFile(ifile)

        savename = setFile(self.getCorrectionsDir(), "saved.yaml")
        if not isFile(savename):
            savedData = {}
        else:
            savedData = getFile(savename)

        for year in savedData.keys():
            for title in savedData[year].keys():
                savedWinner = savedData[year][title].get("Winner")
                savedNominees = savedData[year][title].get("Nominees")
                if savedWinner is not None:
                    print("Overwriting {0} {1} winner".format(year, title))
                    yearlyData[year][title]["Winner"] = savedWinner
                if savedNominees is not None:
                    print("Overwriting {0} {1} nominees".format(year, title))
                    yearlyData[year][title]["Nominees"] = savedNominees

        savename = setFile(self.getCorrectionsDir(), "raw.yaml")
        saveFile(idata=yearlyData, ifile=savename)
Code Example #6
File: dbArtistsParse.py  Project: tgadf/dbdata
 def parse(self, expr, force=False, debug=False, quiet=False):
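     ## Re-save each newly found raw file under the savename derived from its parsed artist ID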
     ts = timestat("Parsing Raw Files")  
     
     tsFiles  = timestat("Finding Files To Parse")
     newFiles = self.getArtistRawFiles(datatype=self.datatype, expr=expr, force=force)
     tsFiles.stop()
         
     N = len(newFiles)
     tsParse = timestat("Parsing {0} New Raw Files".format(N))
     
     newData = 0
     modValue = 250 if N >= 500 else 50
     for i,ifile in enumerate(newFiles):
         if (i+1) % modValue == 0 or (i+1) == N:
             tsParse.update(n=i+1, N=N)
             #print("{0: <15}Parsing {1}".format("{0}/{1}".format(i+1,N), ifile))
         htmldata = getFile(ifile)
         retval   = self.artist.getData(ifile)
         artistID = retval.ID.ID
         if artistID is None:
             continue
         savename = self.dutils.getArtistSavename(artistID)
         if savename is None:
             continue
         saveFile(idata=htmldata, ifile=savename, debug=False)
         newData += 1
         
     print("Created {0}/{1} New Artist Files".format(newData, N))
     tsParse.stop()
Code Example #7
    def processAACTACategoryData(self, debug=False):
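        ## Collect movies from each AACTA category file, de-duplicate them by year, and give every movie a flat score of 10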
        outdir = self.getDataDir()
        files = findExt(outdir, ext="*.p")

        from collections import OrderedDict
        movies = OrderedDict()
        print(files)
        for ifile in files:

            if debug:
                print("Processing {0}".format(ifile))
            category = getBaseFilename(ifile)
            results = self.parseAACTACategoryData(ifile, category, debug=debug)

            if len(results) == 0:
                raise ValueError("No results for {0}".format(ifile))

            for year, yearData in results.items():
                for category, categoryData in yearData.items():
                    if movies.get(year) is None:
                        movies[year] = []
                    for movie in categoryData:
                        movies[year].append(movie)

        for year in movies.keys():
            movies[year] = list(set(movies[year]))
            yearlyMovies = movies[year]
            movies[year] = []
            for movie in yearlyMovies:
                movies[year].append([movie, 10])

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of AACTA Data to {1}".format(
            len(movies), savename))
        saveFile(savename, movies)
Code Example #8
File: rottentomatoes.py  Project: tgadf/movies
    def parseRottenTomatoes(self, debug=False):
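        ## Merge the yearly results from every Rotten Tomatoes pickle file and rank each year's movies by score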
        outdir = self.getDataDir()
        files = findExt(outdir, ext=".p")

        movies = {}
        for ifile in files:
            result = self.parseRottenTomatoesFile(ifile, debug=debug)
            for year, yearlyResult in result.items():
                if movies.get(year) is None:
                    movies[year] = yearlyResult
                else:
                    movies[year] = {**movies[year], **yearlyResult}

        yearlyData = {}
        for year in movies.keys():
            yearlyData[year] = sorted(movies[year].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "rottentomatoes.json")
        print("Saving", len(yearlyData), "yearly results to", savename)
        saveFile(savename, yearlyData)
Code Example #9
File: dbArtistsKWorbSpotify.py  Project: tgadf/dbdata
    def downloadKWorbSpotifyYouTubeArtists(self, update=False):
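        ## Download the KWorb YouTube artist archive page (if requested) and collect its table rows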
        url = "https://kworb.net/youtube/archive.html"
        savename = "kworb_youtubeartists.p"
        if update is True:
            self.dutils.downloadArtistURL(url=url,
                                          savename=savename,
                                          force=True)

        bsdata = getHTML(savename)
        data = []
        artistDir = self.disc.getArtistsDir()
        saveDir = setDir(artistDir, "youtube")
        print(artistDir)
        for table in bsdata.findAll("table"):
            ths = [th.text for th in table.findAll("th")]
            for tr in table.findAll("tr")[1:]:
                item = dict(zip(ths, tr.findAll("td")))
                data.append(item)

        print(data)

        if False:
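            ## Disabled path: walk the per-artist YouTube pages and download any that are missing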
            bsdata = getHTML(savename)
            artistDir = self.disc.getArtistsDir()
            saveDir = setDir(artistDir, "youtube")
            for div in bsdata.findAll("div", {"class": "subcontainer"}):
                if div.find("span", {"class": "pagetitle"}) is None:
                    continue
                for ref in div.findAll("a"):
                    href = ref.attrs['href']
                    url = "{0}/{1}".format(self.youtubeURL, href)
                    savename = "{0}/{1}".format(saveDir,
                                                href.replace(".html", ".p"))
                    if isFile(savename):
                        print("Y\t", savename, '\t', url)
                    else:
                        print("-\t", savename, '\t', url)
                        #dbArtistsKWorb().dutils.downloadArtistURL(url=fullURL, savename=savename, force=True)

            for ifile in findExt(saveDir, ".p"):
                bsdata = getHTML(ifile)
                for table in bsdata.findAll("table"):
                    trs = table.findAll("tr")
                    for tr in trs[1:]:
                        ref = tr.find("a")
                        href = ref.attrs['href']
                        name = ref.text
                        url = "{0}/{1}".format(self.youtubeURL, href)
                        savename = "{0}/{1}".format(
                            setDir(saveDir, "artist"),
                            href.replace(".html", ".p"))
                        print(url, savename)

                        if isFile(savename) is False:
                            data, code = downloadURL(url)
                            from ioUtils import getFile, saveFile
                            saveFile(idata=data, ifile=savename)
                            sleep(3)
                            break
Code Example #10
    def parseArtistFiles(self, force=False, debug=False):   
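        ## Build per-modval artist DB entries from the downloaded DatPiff search results and save each modval DB file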
        from glob import glob
        
        artistDir = self.disc.getArtistsDir()
        
        artistDBData = {}
                
        files = findExt(self.knownDir, ext='.p')        
        files = glob("/Volumes/Biggy/Discog/artists-datpiff/*/*.p")
        print("Found {0} downloaded search terms".format(len(files)))
        for i,ifile in enumerate(files):
            if ifile.endswith("datPiffKnown.p"):
                continue
            fileresults = getFile(ifile)
            if debug:
                print(i,'/',len(files),'\t',ifile)
            for j,fileresult in enumerate(fileresults):
                if debug:
                    print("  ",j,'/',len(fileresults))
                mixArtists  = fileresult["ArtistName"]
                albumName   = fileresult["AlbumName"]
                albumURL    = fileresult["AlbumURL"]
                
                mixArtistNames = self.mulArts.getArtistNames(mixArtists)
                mixArtistNames = [x.title() for x in mixArtistNames.keys()]
                
                for artistName in mixArtistNames:
                    artistID   = str(self.dutils.getArtistID(artistName))
                    albumID    = str(self.dutils.getArtistID(albumName))
                    modval     = self.dutils.getArtistModVal(artistID)
                    if artistDBData.get(modval) is None:
                        artistDBData[modval] = {}
                    if artistDBData[modval].get(artistName) is None:
                        artistDBData[modval][artistName] = {"Name": artistName, "ID": artistID, "URL": None, "Profile": None, "Media": []}
                    albumData = {"Artists": mixArtistNames, "Name": albumName, "URL": albumURL, "Code": albumID}
                    artistDBData[modval][artistName]["Media"].append(albumData)

                    
                    
                    
        maxModVal   = self.disc.getMaxModVal()
        artistDBDir = self.disc.getArtistsDBDir()     
        totalSaves  = 0
        for modVal,modvaldata in artistDBData.items():
            dbData = {}
            for artistName, artistData in modvaldata.items():
                self.artist.setData(artistData)
                artistVal = self.artist.parse()
                dbData[artistVal.ID.ID] = artistVal
                        
            savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
            print("Saving {0} artist IDs to {1}".format(len(dbData), savename))
            totalSaves += len(dbData)
            saveFile(idata=dbData, ifile=savename)
            
            self.createArtistModValMetadata(modVal=modVal, db=dbData, debug=debug)
            self.createArtistAlbumModValMetadata(modVal=modVal, db=dbData, debug=debug)
            
        print("Saved {0} new artist IDs".format(totalSaves))
Code Example #11
    def processFlopsData(self, debug=False):
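        ## Scrape movie/year pairs from the wikitable rows of each flops HTML file and give every movie a flat score of 10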
        outdir = self.getDataDir()
        files = findExt(outdir, ext=".html")

        from collections import OrderedDict
        movies = OrderedDict()
        yearlyData = {}
        for ifile in files:
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)

            tables = bsdata.findAll("table", {"class": "wikitable"})
            for table in tables:

                trs = table.findAll("tr")

                try:
                    ths = trs[0].findAll("th")
                    ths = [x.text for x in ths]
                    ths = [x.replace("\n", "") for x in ths]
                except:
                    raise ValueError("Could not get headers")

                print(ths)

                for itr, tr in enumerate(trs[2:]):

                    ths = tr.findAll("th")
                    try:
                        movie = ths[0].text
                        movie = movie.replace("\n", "").strip()
                        movie = movie.replace("[nb 2]", "")
                    except:
                        raise ValueError(
                            "Could not find movie in {0}".format(ths))

                    tds = tr.findAll("td")
                    try:
                        year = tds[0].text
                        year = int(year)
                    except:
                        raise ValueError(
                            "Could not find year in {0}".format(tds))

                    print(year, '\t', movie)

                    if yearlyData.get(year) is None:
                        yearlyData[year] = []
                    yearlyData[year].append(movie)

        for year in sorted(yearlyData.keys()):
            movies[year] = []
            for movie in yearlyData[year]:
                movies[year].append([movie, 10])

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of flops Data to {1}".format(
            len(movies), savename))
        saveFile(savename, movies)
Code Example #12
File: ultimatemovierankings.py  Project: tgadf/movies
    def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
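        ## Extract film ranks from the tables in each yearly pickle file and save the sorted results as JSON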
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        from collections import OrderedDict
        movieData = OrderedDict()
        for ifile in sorted(files):
            #ifile = "/Users/tgadfort/Documents/code/movies/ultimatemovierankings/data/2017.p"
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            year = getBaseFilename(ifile)

            data = {}
            done = False
            tables = bsdata.findAll("table")  #, {"id": "table_3"})
            movies = {}
            for it, table in enumerate(tables):
                ths = table.findAll("th")
                trs = table.findAll("tr")
                for itr, tr in enumerate(trs):
                    tds = tr.findAll("td")
                    if len(tds) == 11:
                        val = removeTag(tds[1], 'span')
                        film = val.text
                        film = film.replace(" ({0})".format(year), "")
                        try:
                            rank = float(tds[-1].text)
                        except:
                            try:
                                rank = float(tds[-2].text)
                            except:
                                raise ValueError(tds[-1], tds[-2], tr)

                        movies[film] = rank

            movieData[year] = movies

        yearlyData = {}
        for year in sorted(movieData.keys()):
            yearlyData[year] = sorted(movieData[year].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
            len(yearlyData), savename))
        saveFile(savename, yearlyData)
Code Example #13
File: dbArtistsMusicStack.py  Project: tgadf/dbdata
 def parseDownloadedFiles(self):
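     ## Re-save each downloaded discography page under the savename derived from its parsed artist ID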
     artistDir = self.disc.getArtistsDir()
     dataDir = setDir(artistDir, "data")
     files = findPatternExt(dataDir,
                            pattern="Discography and Albums",
                            ext=".htm")
     for ifile in files:
         htmldata = getFile(ifile)
         retval = self.getData(ifile)
         artistID = retval.ID.ID
         savename = self.getArtistSavename(artistID)
         saveFile(idata=htmldata, ifile=savename, debug=True)
Code Example #14
 def parseDownloadedFiles(self, previousDays=None, force=False):
     artistDir = self.disc.getArtistsDir()
     files = self.getArtistRawHTMLFiles(previousDays=None, force=False)
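     ## Early return: the parsing code below is currently disabled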
     return
     dataDir = setDir(artistDir, "data")
     files = findPatternExt(dataDir, pattern="Rate Your Music", ext=".html")
     for ifile in files:
         htmldata = getFile(ifile)
         retval = self.artist.getData(ifile)
         artistID = retval.ID.ID
         savename = self.dutils.getArtistSavename(artistID)
         saveFile(idata=htmldata, ifile=savename, debug=False)
Code Example #15
    def downloadUnknownArtistCompositions(self):
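        ## Download the songs page for each artist with no composition file; IDs whose pages lack a composer table are added to the ignore list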
        newIgnores = []
        for modVal, modValMetadata in self.metadata.items():
            N = len(modValMetadata)
            ts = timestat(
                "Downloading {0} Unknown Composition Files For ModVal={1}".
                format(N, modVal))
            for i, (artistID,
                    artistIDData) in enumerate(modValMetadata.items()):
                savename = self.dutils.getArtistSavename(artistID, song=True)

                href = artistIDData["URL"]
                artist = artistIDData["Name"]
                if isFile(savename):
                    continue

                ## Replace /credits with /songs
                href = "/".join(href.split('/')[:-1] + ["songs", "all"])

                ## Create Full URL
                url = urllib.parse.urljoin(self.dbArtists.baseURL, href)
                print("\n")
                print("=" * 100)
                print("{0}/{1}:  [{2}] / [{3}]".format(i, N, artist, url))

                data, response = self.dutils.downloadURL(url)
                if response == 200:
                    bsdata = getHTML(data)
                    if len(bsdata.findAll("th",
                                          {"class": "title-composer"})) > 0:
                        print("  ---> Saving Data To {0}".format(savename))
                        saveFile(idata=data, ifile=savename)
                        sleep(3)
                        continue

                sleep(3)
                newIgnores.append(artistID)

                if i == 20:
                    break
            ts.stop()

        print("New IDs To Ignore")
        print(newIgnores)
        tsUpdate = timestat(
            "Adding {0} ArtistIDs To Master Composition Ignore List".format(
                len(newIgnores)))
        self.updateMasterIgnoreCompositionData(newIgnores)
        tsUpdate.stop()
Code Example #16
 def parseFilms101Data(self, debug=False):
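     ## Parse each yearly films101 pickle file and give every movie a flat score of 10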
     outdir = self.getDataDir()
     resultsdir = self.getResultsDir()
     files  = findExt(outdir, ext=".p")
     movies = {}
     
     for ifile in sorted(files):
         year    = getBaseFilename(ifile)
         results = self.parseFilms101YearlyData(ifile, debug=debug)
         movies[year] = []
         for movie in results:
             movies[year].append([movie,10])
         print("Found {0} movies in {1}".format(len(movies[year]),year))
     savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
     print("Saving {0} Years of films101 Data to {1}".format(len(movies), savename))
     saveFile(savename, movies)
Code Example #17
File: dbArtistsBase.old.py  Project: tgadf/dbdata
 def createArtistAlbumModValMetadata(self, modVal, db=None, debug=False):
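     ## Collect album names and URLs (keyed by album code) for every artist in this modval DB and save them as media metadata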
     if db is None:
         db = self.disc.getArtistsDBModValData(modVal)
     
     artistIDMetadata = {}
     for artistID,artistData in db.items():
         artistIDMetadata[artistID] = {}
         for mediaName,mediaData in artistData.media.media.items():
             albumURLs  = {mediaValues.code: mediaValues.url for mediaValues in mediaData}
             albumNames = {mediaValues.code: mediaValues.album for mediaValues in mediaData}
             artistIDMetadata[artistID][mediaName] = [albumNames, albumURLs]
     
     artistDBDir = self.disc.getArtistsDBDir()     
     savename    = setSubFile(artistDBDir, "metadata", "{0}-MediaMetadata.p".format(modVal))
     
     print("Saving {0} new artist IDs media data to {1}".format(len(artistIDMetadata), savename))
     saveFile(idata=artistIDMetadata, ifile=savename)
Code Example #18
File: dbArtistsBase.old.py  Project: tgadf/dbdata
 def createArtistModValMetadata(self, modVal, db=None, debug=False):
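     ## Collect each artist's name, URL, and known name variations for this modval DB and save them as metadata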
     if db is None:
         db = self.disc.getArtistsDBModValData(modVal)
 
     artistIDMetadata = {k: [v.artist.name, v.url.url] for k,v in db.items()}
     
     for artistID,artistData in db.items():
         if artistData.profile.variations is not None:
             artistIDMetadata[artistID].append([v2.name for v2 in artistData.profile.variations])
         else:
             artistIDMetadata[artistID].append([artistData.artist.name])
     
     artistDBDir = self.disc.getArtistsDBDir()     
     savename    = setSubFile(artistDBDir, "metadata", "{0}-Metadata.p".format(modVal))
     
     print("Saving {0} new artist IDs name data to {1}".format(len(artistIDMetadata), savename))
     saveFile(idata=artistIDMetadata, ifile=savename)
Code Example #19
    def searchForArtist(self, artist):
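        ## Run an artist search, record the search term in the known list, and parse the results page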
        print("\n\n===================== Searching For {0} =====================".format(artist))
        url = self.getSearchArtistURL(artist)
        if url is None:
            raise ValueError("URL is None!")

        ## Download data
        data, response = self.downloadURL(url)
        if response != 200:
            print("Error downloading {0}".format(url))
            return False
        
        known = getFile(self.knownFile)
        print("  Found {0} previously searched for terms.".format(len(known)))
        known.append(artist)
        saveFile(idata=known, ifile=self.knownFile)

        self.parseSearchArtist(artist, data)
Code Example #20
    def parseBoxOfficeMojoResults(self,
                                  startYear=1982,
                                  endYear=2017,
                                  debug=False):
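        ## Parse the weekly Box Office Mojo pickle files for each year and write one JSON of weekend results per year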
        outdir = self.getDataDir()
        resultsdir = self.getResultsDir()

        if endYear is None: endYear = startYear
        years = range(int(startYear), int(endYear) + 1)
        for year in years:
            retval = []
            files = findPatternExt(outdir, pattern=str(year), ext=".p")
            for ifile in files:
                result = self.parseBoxOfficeMojo(ifile, debug=debug)
                retval.append(result)

            savename = setFile(resultsdir, str(year) + ".json")
            print("Saving", len(retval), "weekends of movie data to", savename)
            saveFile(savename, retval)
Code Example #21
    def mergeBoxOfficeMojoResults(self, debug=False):
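        ## Combine the per-year JSON results into a single results.json keyed by year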
        retval = {}
        files = findExt(self.getResultsDir(), ext=".json")
        if debug:
            print("Found {0} files in the results directory".format(
                len(files)))
        for ifile in sorted(files):
            year = getBaseFilename(ifile)
            try:
                int(year)
            except ValueError:
                continue
            data = getFile(ifile)
            retval[year] = data
            if debug:
                print("  Adding {0} entries from {1}".format(len(data), ifile))

        savename = setFile(self.getResultsDir(), "results.json")
        if debug:
            print("Saving", len(retval), "years of movie data to", savename)
        saveFile(savename, retval)
Code Example #22
    def processBoxOfficeMojo(self, debug=False):
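        ## Flatten the weekly results into per-movie grosses for each year and rank the movies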
        outdir = self.getResultsDir()
        savename = setFile(outdir, "results.json")

        data = getFile(savename)
        movies = {}
        yearlyData = {}
        for i, year in enumerate(sorted(data.keys())):
            movies[year] = {}
            ydata = data[year]

            for wdata in ydata:
                for mdata in wdata:
                    movie = mdata[2]
                    retval = re.search(r"\((\d+)\)", movie)
                    if retval:
                        stryear = retval.group()
                        movie = movie.replace(stryear, "").strip()

                    gross = convertCurrency(mdata[9])
                    weekly = convertCurrency(mdata[4])
                    money = max(gross, weekly)
                    if movies[year].get(movie) is None:
                        movies[year][movie] = money
                    else:
                        movies[year][movie] = max(money, movies[year][movie])

            yearlyData[year] = sorted(movies[year].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
            print("---->", year,
                  " (Top 25/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:25]:
                print(item)
            print('\n')

        savename = setFile(outdir, "{0}.json".format(self.name))
        print("Saving", len(yearlyData), "yearly results to", savename)
        saveFile(savename, yearlyData)
Code Example #23
File: historical.py  Project: tgadf/football
    def downloadGameData(self, debug=False, verydebug=False):
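        ## Download play-by-play data for every game in the 2013-2015 season files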
        resultsDir = self.getSeasonResultsDir()
        files = findExt(resultsDir, ext=".p", debug=False)

        gameType = "playbyplay"
        print("Sleeping for 5 seconds...")
        sleep(5)

        for ifile in files:
            seasonData = getFile(ifile)
            year = seasonData.getYear()
            if year not in [2013, 2014, 2015]:
                continue
            gamesDir = self.getYearlyGamesDir(year)

            teams = seasonData.teams
            for teamID, teamData in teams.items():
                teamGames = teamData.games
                for gameData in teamGames:
                    gameResult = gameData["Result"]
                    gameObject = gameData["Game"]
                    gameID = gameObject.gameID

                    if False:
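                        ## Disabled path: copy previously downloaded play-by-play HTML into the yearly games directory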
                        prevLocation = "/Volumes/Seagate/Football/Games/Plays/{0}.html".format(
                            gameID)
                        if isFile(prevLocation):
                            savename = setFile(gamesDir,
                                               "{0}.p".format(gameID))
                            if not isFile(savename) or True:
                                data = open(prevLocation, "rb").read()
                                saveFile(idata=data,
                                         ifile=savename,
                                         debug=True)
                                continue
                        continue

                    self.downloadGameDataByID(gameID, year, debug)
Code Example #24
File: dbArtistsBase.old.py  Project: tgadf/dbdata
 def downloadArtistURL(self, url, savename, force=False, sleeptime=2):
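     ## Download url to savename unless the file already exists (or force=True), then sleep before returning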
     if isFile(savename):
         if self.debug:
             print("{0} exists.".format(savename))
         if force is False:
             return False
         else:
             print("Downloading again.")
               
     ## Download data
     data, response = self.downloadURL(url)
     if response != 200:
         print("Error downloading {0}".format(url))
         return False
         
     print("Saving {0} (force={1})".format(savename, force))
     saveFile(idata=data, ifile=savename)
     print("Done. Sleeping for {0} seconds".format(sleeptime))
     sleep(sleeptime)
     
     return isFile(savename)
Code Example #25
    def processWikipediaYearlyData(self, procYear=None, debug=False):
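        ## Parse each yearly Wikipedia oscar file and save the winner/nominee results as one JSON per year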
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        from collections import OrderedDict
        movies = OrderedDict()
        for ifile in files:

            if debug:
                print("Processing {0}".format(ifile))
            year = getBaseFilename(ifile)
            #if year == "1985": continue
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            results = self.parseWikipediaOscarData(ifile, debug=False)

            if len(results) == 0:
                results = self.parseWikipediaOscarDataSpecial(ifile,
                                                              debug=debug)
            if len(results) == 0:
                raise ValueError("No results for {0}".format(ifile))

            for k, v in results.items():
                print("====>", year, '\t', k)
                print("      Winner  :", results[k]["Winner"])
                if debug:
                    print("      Nominees:", results[k]["Nominees"])
                    print("")

            savename = setFile(self.getResultsDir(), "{0}.json".format(year))
            print("Saving {0} wikipedia oscar data to {1}".format(
                year, savename))
            saveFile(savename, results)
Code Example #26
    def matchMyMusicAlbums(self, db, albumType=1, ratioCut=0.95, maxCut=0.1):
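        ## Match each artist's albums against the given DB and save the results to self.mmn.moveFilename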
        self.matchedAlbums = {}

        start, cmt = clock(
            "Checking for Albums Matches Against {0} DB".format(db))

        print("{0: <40}{1: <15}{2: <45} --> {3}".format(
            "Artist", "Database", "Album Name", "Matched Album"))

        ######################################################################
        #### Get Map of Artists and Unmatched Albums
        ######################################################################
        artistNames = self.mmb.getArtists()
        #artistAlbums = self.mmb.getArtistAlbums()

        ######################################################################
        #### Loop Over Artist Name <-> Prime Map Items
        ######################################################################
        for artistName in artistNames:
            matchedAlbums = self.matchMyMusicAlbumsByArtist(
                db, artistName, albumType, ratioCut, maxCut)
            if len(matchedAlbums) > 0:
                if self.matchedAlbums.get(db) is None:
                    self.matchedAlbums[db] = {}
                self.matchedAlbums[db][artistName] = matchedAlbums
                for myAlbumName, bestMatchVal in matchedAlbums.items():
                    print("{0: <40}{1: <15}{2: <45} --> {3}".format(
                        artistName, db, myAlbumName, bestMatchVal["Album"]))

        elapsed(start, cmt)

        saveFile(ifile=self.mmn.moveFilename,
                 idata=self.matchedAlbums,
                 debug=True)
        print("Found {0} music <-> discogs albums maps".format(
            len(self.matchedAlbums)))
Code Example #27
    def parseSearchArtist(self, artist, data):
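        ## Extract artist name, album name, and album URL from each search-result div and save them as a new page file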
        if data is None:
            return None
        
        ## Parse data
        bsdata = getHTML(data)
        
        artistDB  = []

        contentdivs = bsdata.findAll("div", {"class": "contentItem"})
        for i,contentdiv in enumerate(contentdivs):
            artistDiv = contentdiv.find("div", {"class": "artist"})
            if artistDiv is None:
                continue
            artistName = artistDiv.text

            albumDiv = contentdiv.find("div", {"class": "title"})
            if albumDiv is None:
                continue
            albumName = albumDiv.text
            try:
                albumURL  = albumDiv.find("a").attrs['href']
            except (AttributeError, KeyError):
                albumURL  = None
                
            artistDB.append({"ArtistName": artistName, "AlbumName": albumName, "AlbumURL": albumURL})
        

        artistID = self.dutils.getArtistID(artist)
        page     = 1
        savename = self.getArtistSavename(artistID, page)
        while isFile(savename):
            page += 1
            savename = self.getArtistSavename(artistID, page)
        print("Saving {0} new artist media to {1}".format(len(artistDB), savename))
        saveFile(idata=artistDB, ifile=savename)
Code Example #28
    def processRollingStoneData(self, debug=False):
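        ## Pull movie titles and years from the Rolling Stone list headers and give every movie a flat score of 10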
        outdir = self.getDataDir()
        files = findExt(outdir, ext=".html")

        from collections import OrderedDict
        movies = OrderedDict()
        yearlyData = {}
        for ifile in files:
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)

            h3s = bsdata.findAll("h3", {"class": "c-list__title t-bold"})
            h3s = [x.text for x in h3s]
            h3s = [x.replace("\n", "").strip() for x in h3s]
            for h3 in h3s:
                try:
                    year = int(h3[-5:-1])
                except:
                    raise ValueError("Could not get year from {0}".format(h3))

                movie = h3[1:-8]
                print(year, '\t', movie)

                if yearlyData.get(year) is None:
                    yearlyData[year] = []
                yearlyData[year].append(movie)

        for year in sorted(yearlyData.keys()):
            movies[year] = []
            for movie in yearlyData[year]:
                movies[year].append([movie, 10])

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of rollingstone Data to {1}".format(
            len(movies), savename))
        saveFile(savename, movies)
Code Example #29
 def findMyMovies(self, debug=False):
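     ## Index the movies found on the mounted disks by base filename and save the map as mymovies.json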
     movies = glob("/Volumes/*/Movies/*.*")
     mine   = dict(zip([getBaseFilename(x) for x in movies], movies))
     print("Found {0} movies on my disks".format(len(movies)))
     savename = setFile(self.getDataDir(), "mymovies.json")
     saveFile(idata=mine, ifile=savename, debug=True)
Code Example #30
File: dbBase.py  Project: tgadf/dbdata
 def saveDiagnosticAlbumIDs(self, albumIDs):
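     ## Save the known album IDs to the diagnostics directory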
     savename = setFile(self.getDiagnosticDir(), "albumKnownIDs.p")
     saveFile(ifile=savename, idata=albumIDs)