Esempio n. 1
0
    def processWikiFilmYearlyData(self, procYear=None, debug=False):
        """Parse the yearly WikiFilm data files and save the sorted results.

        Each selected file is handed to ``parseWikiFilmYearlyData``. The
        returned mapping's items are sorted in ascending order of their value,
        the first five are printed, and the per-year lists are written with
        ``saveFile`` to ``<self.name>.json`` in the results directory.

        Args:
            procYear: When ``None`` every ``.p`` file in the data directory is
                processed; otherwise ``str(procYear)`` is passed to
                ``findPatternExt`` as the file-name pattern.
            debug: When true, print the name of each file before parsing it.
                It is not forwarded to ``parseWikiFilmYearlyData``, which is
                always called with ``debug=False``.
        """
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        yearlyData = {}
        for ifile in sorted(files):
            if debug:
                print("Processing {0}".format(ifile))
            year = getBaseFilename(ifile)
            movies = self.parseWikiFilmYearlyData(ifile, debug=False)

            # Ascending sort on the value: the "Top 5" printed below are the
            # five entries with the smallest values.
            yearlyData[year] = sorted(movies.items(),
                                      key=operator.itemgetter(1))
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} WikiFilm data to {1}".format(len(yearlyData),
                                                       savename))
        saveFile(savename, yearlyData)
Esempio n. 2
0
    def createArtistMetadataMap(self):
        """Build the artist-ID lookup tables and save each one as a Series.

        Reads every ``-ArtistMetadata`` ``.p`` file found in the albums
        metadata DB directory and records, per artist ID, the ``'Genre'``,
        ``'Artists'`` and ``'Style'`` entries of that artist's data. The three
        tables are then written to ``ArtistIDToGenre.p``, ``ArtistIDToStyle.p``
        and ``ArtistIDToCollaborations.p`` in the Discog DB directory.
        """
        ts = timestat("Creating Artist DBs")

        genreByID, styleByID, collabByID = {}, {}, {}

        metadataFiles = findPatternExt(self.disc.getAlbumsMetadataDBDir(),
                                       pattern="-ArtistMetadata", ext='.p')
        for metadataFile in metadataFiles:
            print(metadataFile, '\t', end="")
            for artistID, artistData in getFile(metadataFile).items():
                genreByID[artistID] = artistData['Genre']
                collabByID[artistID] = artistData['Artists']
                styleByID[artistID] = artistData['Style']
            # Running total of artists seen so far, printed after the file name.
            print(len(genreByID))
        print("\n\n==============================================\n")

        outputs = (
            ("IDToGenre", genreByID),
            ("IDToStyle", styleByID),
            ("IDToCollaborations", collabByID),
        )
        for suffix, table in outputs:
            target = setFile(self.disc.getDiscogDBDir(),
                             "Artist{0}.p".format(suffix))
            print("Saving {0} entries to {1}\n".format(len(table), target))
            saveFile(ifile=target, idata=Series(table), debug=True)

        ts.stop()
Esempio n. 3
0
    def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
        """Extract film rankings from the saved yearly pages and save them.

        For every selected file the HTML is loaded and each table row with
        exactly 11 ``<td>`` cells is treated as a film entry: the film name is
        the text of the second cell (after ``removeTag(..., 'span')`` and with
        a trailing `` (<year>)`` removed) and the rank is the float value of
        the last cell, falling back to the second-to-last cell. Per year the
        films are sorted by rank in descending order, the first five are
        printed, and everything is written with ``saveFile`` to
        ``<self.name>.json`` in the results directory.

        Args:
            procYear: When ``None`` every ``.p`` file in the data directory is
                processed; otherwise ``str(procYear)`` is passed to
                ``findPatternExt`` as the file-name pattern.
            debug: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If neither of the last two cells of an 11-cell row can
                be converted to a float.
        """
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        movieData = {}
        for ifile in sorted(files):
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            year = getBaseFilename(ifile)

            movies = {}
            for table in bsdata.findAll("table"):
                for tr in table.findAll("tr"):
                    tds = tr.findAll("td")
                    # Only rows with exactly 11 cells are film entries.
                    if len(tds) != 11:
                        continue

                    film = removeTag(tds[1], 'span').text
                    film = film.replace(" ({0})".format(year), "")

                    # Catch only conversion failures so that unrelated errors
                    # (and KeyboardInterrupt) are no longer swallowed.
                    try:
                        rank = float(tds[-1].text)
                    except (TypeError, ValueError):
                        try:
                            rank = float(tds[-2].text)
                        except (TypeError, ValueError) as err:
                            raise ValueError(tds[-1], tds[-2], tr) from err

                    movies[film] = rank

            movieData[year] = movies

        yearlyData = {}
        for year in sorted(movieData):
            yearlyData[year] = sorted(movieData[year].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
            len(yearlyData), savename))
        saveFile(savename, yearlyData)
Esempio n. 4
0
 def parseDownloadedFiles(self, previousDays=None, force=False):
     """Trigger processing of the raw artist HTML files.

     Calls ``self.getArtistRawHTMLFiles`` with the caller's options and
     returns ``None``.

     Args:
         previousDays: Forwarded unchanged to ``getArtistRawHTMLFiles``.
         force: Forwarded unchanged to ``getArtistRawHTMLFiles``.
     """
     artistDir = self.disc.getArtistsDir()
     # Forward the caller's options; they were previously ignored in favour
     # of hard-coded defaults.
     files = self.getArtistRawHTMLFiles(previousDays=previousDays, force=force)
     # NOTE(review): this unconditional return makes the loop below
     # unreachable. It looks like a deliberate switch-off of the older
     # "Rate Your Music" file handling -- confirm and delete the dead code.
     return
     dataDir = setDir(artistDir, "data")
     files = findPatternExt(dataDir, pattern="Rate Your Music", ext=".html")
     for ifile in files:
         htmldata = getFile(ifile)
         retval = self.artist.getData(ifile)
         artistID = retval.ID.ID
         savename = self.dutils.getArtistSavename(artistID)
         saveFile(idata=htmldata, ifile=savename, debug=False)
Esempio n. 5
0
 def parseDownloadedFiles(self):
     """Re-save each downloaded discography page under its artist save name.

     Looks in the ``data`` sub-directory of the artists directory for
     ``.htm`` files matching "Discography and Albums". For each one it loads
     the file contents, obtains the artist ID from ``self.getData`` and
     writes the contents to the path given by ``self.getArtistSavename``.
     """
     dataDir = setDir(self.disc.getArtistsDir(), "data")
     downloads = findPatternExt(dataDir,
                                pattern="Discography and Albums",
                                ext=".htm")
     for download in downloads:
         contents = getFile(download)
         parsed = self.getData(download)
         # The parsed result carries the artist ID as ``.ID.ID``.
         target = self.getArtistSavename(parsed.ID.ID)
         saveFile(idata=contents, ifile=target, debug=True)
Esempio n. 6
0
    def parseBoxOfficeMojoResults(self,
                                  startYear=1982,
                                  endYear=2017,
                                  debug=False):
        """Parse the saved Box Office Mojo files year by year and save results.

        For each year in ``startYear..endYear`` (inclusive) every ``.p`` file
        returned by ``findPatternExt`` for ``str(year)`` is parsed with
        ``parseBoxOfficeMojo``; the list of results is written with
        ``saveFile`` to ``<year>.json`` in the results directory.

        Args:
            startYear: First year to process (converted with ``int``).
            endYear: Last year to process, inclusive. ``None`` means only
                ``startYear`` is processed.
            debug: Forwarded to ``parseBoxOfficeMojo``.
        """
        outdir = self.getDataDir()
        resultsdir = self.getResultsDir()

        if endYear is None:
            endYear = startYear

        for year in range(int(startYear), int(endYear) + 1):
            files = findPatternExt(outdir, pattern=str(year), ext=".p")
            retval = [self.parseBoxOfficeMojo(ifile, debug=debug)
                      for ifile in files]

            savename = setFile(resultsdir, str(year) + ".json")
            print("Saving", len(retval), "weekends of movie data to", savename)
            saveFile(savename, retval)
Esempio n. 7
0
    def createAlbumIDMap(self):
        """Build the album-ID lookup tables and save each one as a Series.

        Reads every ``-ArtistAlbums`` ``.p`` file found in the albums metadata
        DB directory. Each file maps artist IDs to a mapping of album IDs to
        album data; element 0 of the album data is stored as the album name
        and element 1 as the album reference. Every album ID is also mapped
        to the list of distinct artist IDs it was found under. The three
        tables are written to ``AlbumIDToName.p``, ``AlbumIDToRef.p`` and
        ``AlbumIDToArtists.p`` in the Discog DB directory.
        """
        # The label previously read "Creating Artist DBs", which does not
        # describe what this method builds.
        ts = timestat("Creating Album DBs")

        albumIDToName = {}
        albumIDToRef = {}
        artistsByAlbum = {}

        albumsMetadataDBDir = self.disc.getAlbumsMetadataDBDir()
        files = findPatternExt(albumsMetadataDBDir, pattern="-ArtistAlbums", ext='.p')
        for ifile in files:
            print(ifile, '\t', end="")
            for artistID, artistData in getFile(ifile).items():
                for albumID, albumData in artistData.items():
                    # Elements 2 and 3 of albumData are deliberately not
                    # read: their values were never used, and the old
                    # ``.most_common(1)[0]`` lookups raised IndexError
                    # whenever ``most_common`` returned an empty list.
                    albumIDToName[albumID] = albumData[0]
                    albumIDToRef[albumID] = albumData[1]
                    # Collect into a set so duplicates are dropped as we go.
                    artistsByAlbum.setdefault(albumID, set()).add(artistID)
            print(len(artistsByAlbum))
        print("\n\n==============================================\n")

        albumIDToArtists = {albumID: list(artistIDs)
                            for albumID, artistIDs in artistsByAlbum.items()}
        print("\n\n==============================================\n")

        savenames = {"IDToName": albumIDToName, "IDToRef": albumIDToRef, "IDToArtists": albumIDToArtists}
        for basename, savedata in savenames.items():
            savename = setFile(self.disc.getDiscogDBDir(), "Album{0}.p".format(basename))
            print("Saving {0} entries to {1}\n".format(len(savedata), savename))
            saveFile(ifile=savename, idata=Series(savedata), debug=True)

        ts.stop()
Esempio n. 8
0
    def processWikipediaYearlyData(self, procYear=None, debug=False):
        """Parse the saved yearly Wikipedia Oscar files and save one result per year.

        Each selected file is parsed with ``parseWikipediaOscarData``; when
        that yields nothing, ``parseWikipediaOscarDataSpecial`` is tried. The
        winner of every category is printed and the results are written with
        ``saveFile`` to ``<year>.json`` in the results directory, where
        ``<year>`` is the base name of the input file.

        Args:
            procYear: When ``None`` every ``.p`` file in the data directory is
                processed; otherwise ``str(procYear)`` is passed to
                ``findPatternExt`` as the file-name pattern.
            debug: When true, print each file name and the nominees of every
                category; also forwarded to ``parseWikipediaOscarDataSpecial``.

        Raises:
            ValueError: If both parsers return an empty result for a file.
        """
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        for ifile in files:
            if debug:
                print("Processing {0}".format(ifile))
            year = getBaseFilename(ifile)

            # The parsers receive the file path themselves; the file is no
            # longer loaded and parsed here, since that result was never used.
            results = self.parseWikipediaOscarData(ifile, debug=False)
            if len(results) == 0:
                results = self.parseWikipediaOscarDataSpecial(ifile,
                                                              debug=debug)
            if len(results) == 0:
                raise ValueError("No results for {0}".format(ifile))

            for category, entry in results.items():
                print("====>", year, '\t', category)
                print("      Winner  :", entry["Winner"])
                if debug:
                    print("      Nominees:", entry["Nominees"])
                    print("")

            savename = setFile(self.getResultsDir(), "{0}.json".format(year))
            print("Saving {0} wikipedia oscar data to {1}".format(
                year, savename))
            saveFile(savename, results)