def define_Controversy(db_loc): from Scripts import db_ec contro_table_stat = '''CREATE TABLE IF NOT EXISTS contRatings( movieId integer, ratio real, PolarFreq real, TotalVotes integer, AvgRating real, primary key (movieId), foreign key (movieId) references ratings(movieId), foreign key (movieId) references metadata(ratingId));''' calc_stat_data = '''SELECT t.movieId, t.HighCount*1.0/t.LowCount as Ratio, ((t.HighCount + t.LowCount)*1.0/t.TotalCount) as PolarFreq, t.TotalCount, t.AvgRating FROM (SELECT movieId, COUNT(CASE WHEN rating <= 1 THEN rating END) as LowCount, COUNT(CASE WHEN rating > 4 THEN rating END) as HighCount, COUNT(rating) as TotalCount, AVG(rating) as AvgRating FROM ratings GROUP BY movieId) t ; ''' try: conn = db_ec.connect_db(db_loc) db_ec.create_table(conn, contro_table_stat) conn.commit() except: print("Error creating table") finally: conn.close() ##load controversy table try: conn = db_ec.connect_db(db_loc) cur = conn.cursor() cont_dat = cur.execute(calc_stat_data) cur.executemany( 'INSERT OR REPLACE INTO contRatings(movieId,ratio,PolarFreq,TotalVotes, AvgRating) values (?,?,?,?,?)', cont_dat.fetchall()) conn.commit() finally: conn.close()
def movieRatings_load(db_loc, rating_csv_loc): from Scripts import db_ec ratings_stat = '''CREATE TABLE IF NOT EXISTS ratings( userID integer, movieId integer, rating real, timestamp integer, primary key(userID,movieID, timestamp)) ; ''' try: conn = db_ec.connect_db(db_loc) db_ec.create_table(conn, ratings_stat) except: print("Error creating table 2") finally: conn.close() ##load data from clean ratings data try: conn = db_ec.connect_db(db_loc) cur = conn.cursor() with open(rating_csv_loc, 'r') as myfile: reader = csv.DictReader(myfile) to_db = [(i['userId'], i['movieId'], i['rating'], i['timestamp']) for i in reader] cur.executemany( '''INSERT OR REPLACE into ratings(userID,movieID,rating,timestamp) VALUES(?,?,?,?);''', to_db) conn.commit() except: print('Error loading ratings') finally: conn.close()
def movieKeywords_load(db_loc, keyword_csv_loc): from Scripts import db_ec keyword_stat = '''CREATE TABLE IF NOT EXISTS keywords( id integer, keywords text, primary key(id), foreign key(id) references metadata(id) ); ''' try: conn = db_ec.connect_db(db_loc) db_ec.create_table(conn, keyword_stat) except: print("Error creating table") finally: conn.close() try: conn = db_ec.connect_db(db_loc) cur = conn.cursor() with open(keyword_csv_loc, 'r', encoding='utf8') as myfile: reader = csv.DictReader(myfile) to_db = [(i['id'], i['keywords']) for i in reader] cur.executemany( '''INSERT OR REPLACE into keywords(id,keywords) VALUES(?,?);''', to_db) conn.commit() except: print('Error loading keywords') finally: conn.close()
def movieCSVcur(db_loc, statement, output): from Scripts import db_ec conn = db_ec.connect_db(db_loc) cur = conn.cursor() cur.execute(statement) with open(output, 'w', encoding='utf8') as myfile: writer = csv.writer(myfile) writer.writerow(i[0] for i in cur.description) writer.writerows(cur) conn.close()
def movieMeta_load(db_loc, meta_csv_loc): from Scripts import db_ec meta_stat = '''CREATE TABLE IF NOT EXISTS metadata( adult text, belongs_to_collection text, budget integer, genres text, homepage text, id integer, imdb_id text, original_language text, original_title text, overview text, popularity real, poster_path text, production_companies text, production_countries text, release_date text, revenue real, runtime real, spoken_languages text, status text, tagline text, title text, video text, vote_average real, vote_count real, ratingId integer, primary key(id), foreign key (ratingId) references ratings(movieId)) ; ''' try: conn = db_ec.connect_db(db_loc) db_ec.create_table(conn, meta_stat) except: print("Error creating metadata table") finally: conn.close() ##load try: conn = db_ec.connect_db(db_loc) cur = conn.cursor() records = cur.execute('SELECT * FROM metadata;') #clear current records if records: cur.execute("DELETE FROM metadata;") conn.commit() with open(meta_csv_loc, 'r', encoding="utf8") as myfile: reader = csv.DictReader(myfile) to_db = [ (i['adult'], i['belongs_to_collection'], i['budget'], i['genres'], i['homepage'], i["id"], i["imdb_id"], i["original_language"], i["original_title"], i["overview"], i["popularity"], i["poster_path"], i["production_companies"], i["production_countries"], i["release_date"], i["revenue"], i["runtime"], i["spoken_languages"], i["status"], i["tagline"], i["title"], i["video"], i["vote_average"], i["vote_count"], i["ratingId"]) for i in reader ] cur.executemany( '''INSERT OR REPLACE INTO metadata (adult, belongs_to_collection,budget,genres, homepage, id, imdb_id, original_language,original_title,overview,popularity,poster_path, production_companies,production_countries,release_date,revenue,runtime,spoken_languages, status,tagline,title,video,vote_average,vote_count, ratingId) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?); ''', to_db) conn.commit() except: print("Error loading DB") finally: conn.close()
(dat['PolarFreq'] > PolarRating), 1, 0) print("Contrversial Ratio set between " + str(ratio) + " and " + str(1 / ratio)) print("Polar Voting must make up " + str(PolarRating) + " percentile of ratings") ##add rewrites dat['Rewrite'] = np.where(dat['title'] != dat['original_title'], 1, 0) ##pull keywords from movies after 1994 and with more than 30 votes stat = '''SELECT k.id, k.keywords FROM metadata m, keywords k, contratings r WHERE k.id = m.id AND r.movieId = m.ratingId AND m.release_date > '1994-12-31' AND r.TotalVotes > 30;''' conn = db_ec.connect_db(db_loc) keywords_Df = pd.read_sql_query(stat, conn) print("Keywords pulled from database") ##format from json to list def get_key_name_list(x): if isinstance(x, list): names = [i['name'] for i in x] return names return [] keywords_Df['keywords'] = keywords_Df['keywords'].apply(literal_eval) keywords_Df['keywords'] = keywords_Df['keywords'].apply(get_key_name_list)