Example #1
def GetPredictions(file=None):
    results = []

    # So I don't have to re-do a lot of predictions if I start and stop
    records_to_skip = 0
    if file is not None:
        results = pd.read_csv(file).to_dict('records')
        records_to_skip = len(results)

    count = 0
    total_count = req_reviews.shape[0]
    t = Timer()
    t.Start()
    for _, row in req_reviews.iterrows():
        count += 1
        if count > records_to_skip: # in the event we're continuing a file, jump to the last record
            predicted = PredictReview(row.reviewerID, row.asin)
            results.append({"datapointID":row.datapointID,"overall":predicted})
            if count % 1000 == 0:
                # informative prints so we know it's still working
                t.Stop()
                super_print("({} of {}) ({:.4f}s/prediction)".format(count, total_count, t.elapsed/1000))
                t.Start()
                DataFrame(results).to_csv("output.csv", index=False)
    DataFrame(results).to_csv("output.csv", index=False)
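The Timer and super_print helpers used throughout these examples aren't shown. A minimal sketch of what Timer might look like, inferred only from how Start(), Stop(), elapsed, and str() are used above (the real class may differ):

import time

class Timer:
    # hypothetical stopwatch matching the Start()/Stop()/elapsed/str() usage above
    def __init__(self):
        self.elapsed = 0.0
        self._start = None

    def Start(self):
        self._start = time.perf_counter()

    def Stop(self):
        self.elapsed = time.perf_counter() - self._start

    def __str__(self):
        return "{:.2f}s".format(self.elapsed)

Calling GetPredictions("output.csv") after an interruption skips the rows already written to that file and keeps appending from there.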
Example #2
def Load_MovieData():
    print("Loading Movie data from CSV...")
    t = Timer()
    t.Start()
    # Create a temporary table, since the dataset has duplicate IDs that violate
    # the PRIMARY KEY constraint on FilmID. SQLite doesn't support ADD CONSTRAINT,
    # so we make an identical table without the constraint, fill it with data,
    # then copy that data row by row into the real Film table
    statement = '''
    CREATE TABLE "Film_temp" (
        'FilmID' INTEGER,
        'Title' TEXT,
        'Release' TEXT,
        'Budget' INTEGER,
        'Revenue' INTEGER,
        'Runtime' INTEGER,
        'Rating' TEXT,
        'Poster' TEXT,
        'Rating_IMDB' INTEGER,
        'Rating_RT' INTEGER,
        'Rating_MC' INTEGER,
        'BestPicture' INTEGER,
        'AA_Wins' INTEGER,
        'AA_Nominations' INTEGER
    );
    '''
    cur.execute(statement)

    statement = 'INSERT INTO Film_temp VALUES (?,?,?,?,?,?,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)'
    for f in pd.read_csv(MOVIEMETADATA_CSV, iterator=True):
        inserts = []
        for row in f.itertuples():
            inserts.append([
                row[6],
                row[9],
                row[15],
                row[3],
                row[16],
                row[17],
            ])
        # insert each chunk as it's read, so nothing is lost if the CSV comes back in pieces
        cur.executemany(statement, inserts)

    # the CSV contains duplicate entries for 29 films - keep just one row for each here
    statement = '''
        DELETE FROM Film_temp WHERE rowid NOT IN (SELECT MIN(rowid) FROM Film_temp GROUP BY FilmID)
    '''
    cur.execute(statement)

    # copy the entirety of the temp table to the actual Film table, which has the PK constraint
    cur.execute("SELECT * FROM Film_temp")
    inserts = []
    statement = 'INSERT INTO Film VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    for row in cur:
        inserts.append(row)

    cur.executemany(statement, inserts)

    t.Stop()
    print("Movie Data loaded in " + str(t))
Example #3
def Create_NPZ(file, output_file):
    reviews = pd.read_json(file, lines=True)
    reviews.sort_values(['reviewerID', 'asin'], ascending=[True, True], inplace=True)

    # create two DataFrames to act as indexes for the matrix
    # the index of the movie dataframe is the column number
    # the index of the users dataframe is the row number
    # example:
    #    users[users.userID=="A01174011QPNX7GZF4B92"].index.values[0] returns 7
    #    movies[movies.asin=="6300248135"].index.values[0] returns 9
    #    the value of m_reviews[7,9] is 5
    movies = DataFrame(data=reviews.asin.unique(), columns=["asin"])
    users = DataFrame(data=reviews.reviewerID.unique(), columns=["userID"])

    # initialize a new lil matrix, with size (users,movies)
    m_reviews = lil_matrix((len(users), len(movies)), dtype=np.int8)

    t = Timer()
    t.Start()
    count = 0
    total = reviews.shape[0]
    # iterate through all rows in the reviews file I've loaded in
    for _, row in reviews.iterrows():
        count += 1
        # grab the user and movie IDs from the dictionaries I made
        m_row_ID = users[users.userID==row.reviewerID].index.values[0]
        m_col_ID = movies[movies.asin==row.asin].index.values[0]
        m_value = row.overall
        # assign the rating value to the matrix coordinate [user,movie]
        m_reviews[m_row_ID, m_col_ID] = m_value

        # just so I know it's still working
        if count % 1000 == 0:
            sys.stdout.write("{} of {} ({} remaining)...\n".format(count, total, (total-count)))
            sys.stdout.flush()

    # save the user->userid and movie->movieid dictionaries to files, because I've lost them twice already
    movies.to_json(output_file+"movies_df.json",orient='records', lines=True)
    users.to_json(output_file+"users_df.json",orient='records', lines=True)

    # convert to a coo matrix so we can save it
    m_reviews = coo_matrix(m_reviews)
    # save to file
    scipy.sparse.save_npz(output_file + ".npz", m_reviews)
    t.Stop()
    print("Completed in ",t)
Example #4
def Load_Ratings():
    print("Loading Ratings data from CSV...")
    t = Timer()
    t.Start()
    chunksize = 100000
    i = 0
    for f in pd.read_csv(RATINGS_CSV, chunksize=chunksize, iterator=True):
        inserts = []
        for row in f.itertuples():
            inserts.append(row[1:])
        statement = 'INSERT INTO Ratings VALUES (?,?,?,?)'
        cur.executemany(statement, inserts)
        conn.commit()
        i += 1
        sys.stdout.write("loading chunk #{}...\n".format(str(i)))
        sys.stdout.flush()
    t.Stop()
    print("Ratings Loaded in " + str(t))
Example #5
def Load_Credits():
    print("Loading Film Credits from CSV...")
    t = Timer()
    t.Start()
    chunksize = 30
    i = 0
    sys.stdout.write("loading chunks.")
    for f in pd.read_csv(CREDITS_CSV, chunksize=chunksize, iterator=True):
        inserts = []
        for row in f.itertuples():
            movieId = row[3]
            pattern = r'{.*?}'  # pull out the strings inside curly braces
            for cast in re.findall(pattern, row[1]):
                try:
                    cast_json = CleanJSONString(cast)
                    AddPersonToDB(movieId, cast_json['name'], "Cast")
                except Exception as e:
                    # use a different name here so the CSV chunk 'f' isn't shadowed
                    with open("errors.txt", 'a', encoding="utf8") as err_file:
                        err_file.write("ERR: " + str(e) + "\n")
                        err_file.write(cast + "\n\n")

            for crew in re.findall(pattern, row[2]):
                try:
                    crew_json = CleanJSONString(crew)
                    AddPersonToDB(movieId, crew_json['name'], crew_json['job'])
                except Exception as e:
                    with open("errors.txt", 'a', encoding="utf8") as err_file:
                        err_file.write("ERR: " + str(e) + "\n")
                        err_file.write(crew + "\n\n")
        i += 1
        # sys.stdout.write("loading chunk #{} of 25...\n".format(str(i)))
        sys.stdout.write(".")
        sys.stdout.flush()
        # if i == 10:
        #     break
    t.Stop()
    print()
    print("Credits Loaded in " + str(t))
Example #6
def AAwardWinningFilms():
    t = Timer()
    t.Start()
    url = 'https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films'
    AA_Cache = CacheFile('WikipediaCache.json', print_info=True)
    AA_Soup = AA_Cache.CheckCache_Soup(url, strainer=SoupStrainer(class_="wikitable"))
    Films = []
    for row in AA_Soup.find_all("tr"):
        if "Nominations" in row.text:
            pass
        else:
            cols = row.find_all("td")
            f = FilmAcademyAward()
            f.title = cols[0].text
            f.year = cols[1].text.split('/')[0]
            try:
                f.BestPicture = ("#EEDD82" in row.attrs['style'])
            except:
                pass
            f.Awards = tryParseInt(cols[2].text.split(' ')[0])
            f.Nominations = tryParseInt(cols[3].text)
            Films.append(f)

    conn = sqlite3.connect(Database_Name)
    cur = conn.cursor()
    inserts = []
    for film in Films:
        inserts.append(film.InsertTuple())

    statement = '''
        UPDATE Film SET BestPicture=?,AA_Wins=?,AA_Nominations=? WHERE Title == ? AND Release LIKE ?
    '''
    cur.executemany(statement,inserts)

    conn.commit()
    conn.close()
    t.Stop()
    print("Scraping Completed in " + str(t))
Example #7
def ResetDatabase():
    try:
        t = Timer()
        t.Start()
        global cur
        conn = sqlite.connect(DATABASE_NAME)
        cur = conn.cursor()

        ResetTable("Film")  # will also reset ratings when it's done
        ResetTable("Credits")

        conn.commit()
        t.Stop()
        print("Database Reset in " + str(t))
    except sqlite.OperationalError as e:
        if str(e) == "database is locked":
            print(DATABASE_NAME +
                  " has pending changes. Write those changes and restart")
        else:
            print("Database ERROR: " + str(e))
            print(type(e))
    except Exception as e:
        print("ERROR: " + str(e))
        print(type(e))
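ResetTable isn't shown either. A rough sketch consistent with how it is called, where the clear-and-reload behavior and the Film-to-Ratings cascade follow the comment above (the exact rebuild steps are assumptions):

def ResetTable(table):
    # hypothetical: clear the table and reload it from the CSV loaders above
    cur.execute('DELETE FROM "{}"'.format(table))
    if table == "Film":
        Load_MovieData()
        Load_Ratings()  # per the comment in ResetDatabase, resetting Film also resets Ratings
    elif table == "Credits":
        Load_Credits()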
def InitializeOMDBImport():
    t = Timer()
    t.Start()
    print("Loading data from OMDB API...")
    conn = sqlite3.connect(Database_Name)
    cur = conn.cursor()
    cur2 = conn.cursor()

    # get ratings for the most popular, most highly rated films, and any film that
    # has won at least 2 academy awards
    statement = '''
    SELECT Title, Release FROM Film
        WHERE FilmID IN
            (
                SELECT MovieID
                FROM Ratings
                GROUP BY MovieID
                HAVING COUNT(*) > 10
                ORDER BY AVG(Rating) DESC
                LIMIT 350
            )
        OR FilmID IN
            (
                SELECT MovieID
                FROM Ratings
                GROUP BY MovieID
                ORDER BY COUNT(*) DESC
                LIMIT 500
            )
        OR FilmID IN
            (
                SELECT FilmID
                FROM Film
                WHERE AA_Wins > 1
            )
    '''
    cur.execute(statement)

    updates = []
    for row in cur:
        try:
            OMD_data = Import_OMD(row[0], row[1][:4])
            values = [None, None, None, None, None, row[0], row[1]]
            values[0] = OMD_data['Rated']
            values[1] = OMD_data['Poster']
            for ratings in OMD_data['Ratings']:
                if ratings['Source'] == "Internet Movie Database":
                    values[2] = ratings['Value'].split('/')[0]
                if ratings['Source'] == "Rotten Tomatoes":
                    values[3] = ratings['Value']
                if ratings['Source'] == "Metacritic":
                    values[4] = ratings['Value'].split('/')[0]
            updates.append(values)
        except Exception as e:
            # skip films OMDB can't find or that come back without the fields we need
            pass
    statement = 'UPDATE Film SET Rating=?, Poster=?, Rating_IMDB = ?, Rating_RT=?, Rating_MC=? WHERE Title == ? AND Release == ?'
    cur.executemany(statement, updates)
    conn.commit()
    conn.close()

    t.Stop()
    print("OMDB Import completed in " + str(t))