def get_trailer_metadata():
    """Function which uses youtubeId to collect trailer metadata"""

    #get all trailers from the database
    trailers_df = database_helper.select_query("trailers")
    with tqdm(total=len(trailers_df)) as pbar:
        for index, row in trailers_df.iterrows():
            #use the youtube id to make an api request for video metadata
            trailer_data = yt.get_video_metadata(row['youtubeId'])

            #update the db with the collected metadata
            update_params = {
                'title': trailer_data['video_title'],
                'channelTitle': trailer_data['channel_title'],
                'channelId': trailer_data['channel_id'],
                'categoryId': trailer_data['video_category'],
                'commentCount': trailer_data['video_comment_count'],
                'description': trailer_data['video_description'],
                'likeCount': trailer_data['video_like_count'],
                'dislikeCount': trailer_data['video_dislike_count'],
                'viewCount': trailer_data['video_view_count'],
                'publishDate': trailer_data['video_publish_date'],
                'tags': trailer_data['video_tags']
            }
            select_params = {"youtubeId": row["youtubeId"]}
            database_helper.update_data("trailers",
                                        update_params=update_params,
                                        select_params=select_params)
            pbar.update(1)
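# Example usage (a sketch -- assumes database_helper and the YouTubeHelper
# client `yt` are configured as elsewhere in this project):
#
#   get_trailer_metadata()
#
# Updates are matched on youtubeId, so re-running the function simply
# refreshes the stored metadata rather than duplicating rows.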
def get_youtube_trailers():
    """Attempt to collect movie trailers from YouTube (does not work due to API limits)"""

    movies_df = database_helper.select_query("movies")
    with tqdm(total=len(movies_df)) as pbar:
        for index, row in movies_df.iterrows():
            #strip bracketed suffixes and punctuation from the title before searching
            title = re.sub(r"\s*\(.*\)\s*", "", row["title"])
            title = re.sub(r'[^\w\s]', '', title)
            print(title)
            # if (row['distributor']):
            #     try:
            #         yt_search = yt.search(q=title + " trailer", max_results=10, parser=None)
            #         distributor_trailers = list(filter(lambda x: row['distributor'].lower() in x['snippet']['channelTitle'].lower(), yt_search))
            #         if (len(distributor_trailers) > 0):
            #             #add trailers to db
            #             for trailer in distributor_trailers:
            #                 database_helper.insert_data("trailers", {"movieId": row["movieId"], "youtubeId": trailer['id']['videoId']})
            #         else:
            #             print("Couldn't find trailer for " + row["title"])
            #     except Exception as error:
            #         print(error)
            pbar.update(1)
def __init__(self, db_row):
    """
    Director box office class constructor

    :param db_row: pandas series object corresponding to row from which object should be built
    """
    self.movie_imdbId = db_row.m_imdbId
    person_df = database_helper.select_query("people", {"imdbId": db_row.p_imdbId})
    Person.__init__(self, person_df.iloc[0])
def get_actors():
    """Function which uses imdb to collect movie actors"""

    #get all movies from db
    movies_df = movie_helper.get_movies_df()
    with tqdm(total=len(movies_df)) as pbar:
        for index, row in movies_df.iterrows():
            #if an imdbId exists use it to look up the API
            if (row['imdbId']):
                movie = ia.get_movie(str(row['imdbId']))

                #get list of cast
                cast_list = movie.get('cast')
                if (cast_list is not None):
                    for cast_member in cast_list:
                        #try to get the name of the character
                        #currentRole is a list when an actor plays multiple roles
                        character_name = ""
                        if (isinstance(cast_member.currentRole, list)):
                            character_name = ','.join(
                                [x['name'] for x in cast_member.currentRole])
                        else:
                            try:
                                character_name = cast_member.currentRole['name']
                            except (KeyError, TypeError):
                                character_name = "Unknown"

                        #first check if the person exists
                        imdb_id = cast_member.personID
                        person_df = database_helper.select_query(
                            "people", {'imdbId': imdb_id})
                        if (person_df.empty):
                            database_helper.insert_data(
                                "people", {
                                    "imdbId": imdb_id,
                                    "fullName": cast_member["name"]
                                })

                        #add movie actor link
                        database_helper.insert_data(
                            "actors", {
                                "p_imdbId": imdb_id,
                                "m_imdbId": row['imdbId'],
                                "role": character_name
                            })
            pbar.update(1)
def check_synopsis():
    """Check the database to make sure a synopsis has been collected for every movie."""

    movies_df = database_helper.select_query("movies", {"enabled": '1'})
    movies = []
    with tqdm(total=len(movies_df)) as pbar:
        for index, row in movies_df.iterrows():
            movie = Movie(row)
            movies.append(movie)

            #if there is no synopsis print the movie to the command line
            if (movie.synopsis == ''):
                print(movie.title + " (" + movie.imdbId + ") no synopsis")
            pbar.update(1)
def __init__(self, db_row):
    """
    Actor box office class constructor

    :param db_row: pandas series object corresponding to row from which object should be built
    """
    self.actorId = db_row.id
    self.movie_imdbId = db_row.m_imdbId
    self.role = db_row.role
    self.credited = db_row.notes != '(uncredited)'

    #get person entry
    person_df = database_helper.select_query("people", {"imdbId": db_row.p_imdbId})
    Person.__init__(self, person_df.iloc[0])
def get_hashtags_from_trailers():
    """Function to extract the movie hashtags from trailer descriptions"""

    #get all the trailers from the db
    trailers_df = database_helper.select_query("trailers")
    with tqdm(total=len(trailers_df)) as pbar:
        for index, row in trailers_df.iterrows():
            #extract hashtags from the description and print to the console for inspection
            if ('#' in row.description):
                hashtags = re.findall(r"#(\w+)", row.description)
                print(row.title)
                print(hashtags)
            pbar.update(1)
def get_trailer_release_dates():
    """Function to specifically update the trailer release dates which could not be retrieved by get_trailer_metadata()"""

    #get all trailers from the db
    trailers_df = database_helper.select_query("trailers")
    with tqdm(total=len(trailers_df)) as pbar:
        for index, row in trailers_df.iterrows():
            #use a customized api request to correctly retrieve the release dates of the trailers
            trailer_date = youtube_helper.get_trailer_release(row['youtubeId'], yt)

            #update the database
            update_params = {'publishDate': trailer_date}
            select_params = {"youtubeId": row["youtubeId"]}
            database_helper.update_data("trailers",
                                        update_params=update_params,
                                        select_params=select_params)
            pbar.update(1)
def add_box_office():
    """Function which adds the weekend box office data from the BFI into the db"""

    #get full film set
    film_df = bfi_helper.get_raw_data()

    with tqdm(total=len(film_df)) as pbar:
        for index, row in film_df.iterrows():
            #get the movie id and use it to insert weekend data into the db
            movie_df = database_helper.select_query("movies", {"title": row['Film']})
            movie_id = int(movie_df.iloc[0]['movieId'])

            #the percentage change column is not always numeric
            percentage_change = None
            try:
                percentage_change = float(row['% change on last week'])
            except ValueError:
                percentage_change = None

            insert_params = {
                "movieId": movie_id,
                "weeksOnRelease": row['Weeks on release'],
                "noOfcinemas": row['Number of cinemas'],
                "weekendGross": row['Weekend Gross'],
                "percentageChange": percentage_change,
                "siteAverage": row['Site average'],
                "grossToDate": row['Total Gross to date'],
                "weekendStart": row['weekendStart'],
                "weekendEnd": row['weekendEnd'],
                "rank": row['Rank']
            }
            database_helper.insert_data("weekend_box_office", insert_params)
            pbar.update(1)
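# Example usage (a sketch -- assumes bfi_helper and database_helper are
# configured, and that add_movies_to_db() has already been run so every BFI
# title can be matched in the movies table):
#
#   add_movies_to_db()
#   add_box_office()
#
# The ValueError fallback above exists because the BFI "% change on last week"
# column is not always numeric (presumably blank or "-" for new releases).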
def get_writers():
    """Function which uses imdb id to get list of writers"""

    #get all movies from db
    movies_df = movie_helper.get_movies_df()
    with tqdm(total=len(movies_df)) as pbar:
        for index, row in movies_df.iterrows():
            #if an imdbId exists use it to look up the API
            if (row['imdbId']):
                movie = ia.get_movie(str(row['imdbId']))

                #get list of writers
                writers = movie.get('writer')
                if (writers is not None):
                    for writer in writers:
                        #first check if the person exists
                        imdb_id = writer.personID
                        person_df = database_helper.select_query(
                            "people", {'imdbId': imdb_id})
                        if (person_df.empty):
                            database_helper.insert_data(
                                "people", {
                                    "imdbId": imdb_id,
                                    "fullName": writer["name"]
                                })

                        #add movie writer link
                        database_helper.insert_data("writers", {
                            "p_imdbId": imdb_id,
                            "m_imdbId": row['imdbId']
                        })
            pbar.update(1)
def add_movies_to_db():
    """Function which creates a unique list of movies from BFI data and inserts into DB"""

    #get the full film set and drop duplicate weekend rows
    film_df = bfi_helper.get_raw_data()
    film_df_unq = film_df.drop_duplicates()

    with tqdm(total=len(film_df_unq)) as pbar:
        for index, row in film_df_unq.iterrows():
            #check that the movie has not been added yet
            existing = database_helper.select_query("movies", {"title": row["Film"]})
            if (existing.empty):
                #insert into db
                database_helper.insert_data(
                    "movies", {
                        "title": row['Film'],
                        "distributor": row['Distributor'],
                        "country": row['Country of Origin']
                    })
            pbar.update(1)
def plot_chi_sqrd_surface(movieId=0,
                          normalize_by="All",
                          start_date=None,
                          end_date=None,
                          critical_period=False):
    """
    Function for generating expectation maps

    :param movieId: integer movieId for creating expectation score for movie tweets
    :param normalize_by: string val indicating if scores should be normalized by all tweets, or movie tweets
    :param start_date: datetime of start date for filtering tweets
    :param end_date: datetime of end_date for filtering tweets
    :param critical_period: bool indicating if movie tweets should only be counted over critical period
    """

    #fix dates so we include the start of start_date and the end of end_date
    if not start_date == None:
        start_date = datetime.combine(start_date.date(), datetime.min.time())

    if not end_date == None:
        end_date = datetime.combine(end_date.date(), datetime.max.time())

    #use gb regions for normalizing and for plotting
    #the Ordnance Survey data contained only the shape files for GB, so populations
    #have to be normalized by this rather than by the fishnet, which also covers NI
    gb_regions = database_helper.get_geo_data("select * from uk_regions", "geombng")
    gb_regions_count = database_helper.select_region_tweets(start_date=start_date,
                                                            end_date=end_date)

    #check if we are using the entire population of tweets or just the movie population
    if normalize_by == "Movies":
        #normalize by movie tweets
        gb_regions_count = database_helper.select_movie_region_tweets(
            start_date=start_date, end_date=end_date)
        gb_regions_count = gb_regions_count.drop(columns=['movieid'])
        gb_regions_count = gb_regions_count.groupby(
            by="cellid").size().reset_index(name="tweet_count")

    total_gb_tweets = gb_regions_count["tweet_count"].sum()

    #first step: get total tweets in the uk fishnet
    uk_fishnet_count = database_helper.select_fishnet_count(start_date=start_date,
                                                            end_date=end_date)

    #now get total movie tweets in the uk fishnet
    movie_fishnet_tweets = database_helper.select_movie_fishnet_tweets(
        movieId, start_date=start_date, end_date=end_date)

    #now get movie tweets per cell
    movie_cell_tweets = movie_fishnet_tweets.groupby(
        by="cellid").size().reset_index(name="movie_tweets")

    #now group with total fishnet counts
    fishnet_movie_comb = uk_fishnet_count.merge(movie_cell_tweets,
                                                how='left',
                                                on='cellid')

    #attach results to geodataframe so they can be plotted
    uk_fishnet = database_helper.get_geo_data("select * from uk_fishnet", "geombng")
    uk_fishnet = uk_fishnet.rename(columns={"id": "cellid"})
    uk_fishnet = uk_fishnet.merge(fishnet_movie_comb, how='left', on='cellid')

    #replace na with 0
    uk_fishnet = uk_fishnet.fillna(0)

    #get total gb tweets for the movie
    gb_movie_fishnet = sjoin(gb_regions, uk_fishnet, how='inner')
    gb_movie_fishnet = gb_movie_fishnet[[
        "cellid", "movie_tweets"
    ]].drop_duplicates().reset_index(drop=True)
    gb_movie_total = gb_movie_fishnet["movie_tweets"].sum()

    #do expectation calculation
    uk_fishnet['surf_expectation'] = uk_fishnet.apply(
        lambda row: calc_surface_expectation(total_gb_tweets, gb_movie_total,
                                             row["tweet_count"],
                                             row["movie_tweets"]),
        axis=1)

    #replace na with 0 (not all fishnet cells have tweets)
    uk_fishnet = uk_fishnet.fillna(0)

    #get cell colors
    uk_fishnet["color"] = uk_fishnet.apply(
        lambda row: get_cell_color(row["surf_expectation"]), axis=1)
    uk_fishnet["label"] = uk_fishnet.apply(
        lambda row: get_cell_label(row["surf_expectation"]), axis=1)

    #return uk_fishnet

    #now do plots
    fig, ax = plt.subplots(1, figsize=(9, 9))

    #this takes time, may be useful to create the overlay and store it in the db,
    #then use pandas join/merge to input the expectation
    overlay = gpd.overlay(gb_regions, uk_fishnet, how='intersection')
    map_ax = overlay.plot(color=overlay['color'], ax=ax)

    title = "Movie Tweets Expectation Map"

    #get movie
    if movieId > 0:
        movies_df = database_helper.select_query("movies", {"movieId": movieId})
        title = movies_df.iloc[0]["title"] + " Tweet Expectation"

    if critical_period:
        title = "{0} (Critical Period)".format(title)
    elif (start_date != None) and (end_date != None):
        title = "{0} ({1} - {2})".format(title, start_date.date(), end_date.date())

    ax.set_axis_off()
    #plt.axis('equal')

    legend_elements = [
        Line2D([0], [0], marker='s', color='red', label='Above Expected',
               markerfacecolor='red', markersize=15),
        Line2D([0], [0], marker='s', color='white', label='At Expected',
               markerfacecolor='white', markersize=15),
        Line2D([0], [0], marker='s', color='blue', label='Below Expected',
               markerfacecolor='blue', markersize=15)
    ]
    ax.legend(handles=legend_elements, loc="upper left")

    plt.title(title)
    plt.show()
    plt.clf()
    plt.cla()
    plt.close()

    return overlay
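# The helpers below are referenced by plot_chi_sqrd_surface() but defined
# elsewhere in the project. This is a minimal sketch of what they plausibly
# compute, assuming a signed chi-square residual per cell: the expected count
# for a cell is the movie's total scaled by the cell's share of all tweets,
# and the score is (observed - expected) / sqrt(expected). The thresholds in
# get_cell_color()/get_cell_label() are illustrative, not the project's
# actual cut-offs.

import math


def calc_surface_expectation(total_tweets, total_movie_tweets, cell_tweets,
                             cell_movie_tweets):
    """Signed chi residual of observed vs expected movie tweets in a cell."""
    if total_tweets == 0 or cell_tweets == 0:
        return 0
    expected = total_movie_tweets * (cell_tweets / total_tweets)
    if expected == 0:
        return 0
    return (cell_movie_tweets - expected) / math.sqrt(expected)


def get_cell_color(expectation):
    """Map an expectation score to a plot colour (illustrative thresholds)."""
    if expectation > 1:
        return "red"    #above expected
    if expectation < -1:
        return "blue"   #below expected
    return "white"      #roughly at expected


def get_cell_label(expectation):
    """Map an expectation score to a legend label (illustrative thresholds)."""
    if expectation > 1:
        return "Above Expected"
    if expectation < -1:
        return "Below Expected"
    return "At Expected"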
def get_most_popular_movie_per_region(start_date=None,
                                      end_date=None,
                                      senti_class=None,
                                      ignore_list=[28, 121],
                                      senti_percentage=False,
                                      critical_period=False):
    """
    Function to get the most popular movie per region by tweet count

    :param start_date: datetime of start date for filtering tweets
    :param end_date: datetime of end_date for filtering tweets
    :param senti_class: string to filter tweets by sentiment
    :param ignore_list: integer list of movie ids to ignore
    :param senti_percentage: bool indicating favourites should be based on sentiment percentage
    :param critical_period: bool indicating if tweets should be filtered to critical period
    :return: dataframe of regions and their favourite movies
    """

    #get all regional tweets according to date and sentiment filters
    region_movie_tweets = database_helper.select_movie_region_tweets(
        start_date=start_date, end_date=end_date, senti_class=senti_class)

    #check if we need to filter by the critical period
    if critical_period:
        movies_df = movie_helper.get_movies_df()
        small_movies_df = movies_df[["movieId", "critical_start", "critical_end"]]
        small_movies_df = small_movies_df.rename(columns={"movieId": "movieid"})

        region_movie_tweets = region_movie_tweets.merge(small_movies_df,
                                                        how="left",
                                                        on="movieid")
        region_movie_tweets = region_movie_tweets[
            (region_movie_tweets["created_at"] >= region_movie_tweets["critical_start"])
            & (region_movie_tweets["created_at"] <= region_movie_tweets["critical_end"])]

    #group tweets by region and movie
    region_movie_grouped = region_movie_tweets.groupby(
        by=["unit_id", "movieid"]).size().reset_index(name="tweet_count")

    #check if we should use sentiment percentage (i.e. film with highest percentage of positive tweets)
    group_col = "tweet_count"
    if (senti_percentage) and (not senti_class == None):
        #calculate sentiment tweets as a percentage of all tweets
        region_movie_all = database_helper.select_movie_region_tweets(
            start_date=start_date, end_date=end_date)

        if critical_period:
            movies_df = movie_helper.get_movies_df()
            small_movies_df = movies_df[["movieId", "critical_start", "critical_end"]]
            small_movies_df = small_movies_df.rename(columns={"movieId": "movieid"})

            region_movie_all = region_movie_all.merge(small_movies_df,
                                                      how="left",
                                                      on="movieid")
            region_movie_all = region_movie_all[
                (region_movie_all["created_at"] >= region_movie_all["critical_start"])
                & (region_movie_all["created_at"] <= region_movie_all["critical_end"])]

        region_movie_all_grouped = region_movie_all.groupby(
            by=["unit_id", "movieid"]).size().reset_index(name="tweet_count_all")

        #use threshold of 20 tweets per region?
        region_movie_all_grouped = region_movie_all_grouped[
            region_movie_all_grouped["tweet_count_all"] >= 20]

        region_movie_grouped = region_movie_grouped.merge(
            region_movie_all_grouped, how="left", on=["unit_id", "movieid"])
        region_movie_grouped["senti_percentage"] = (
            region_movie_grouped["tweet_count"] /
            region_movie_grouped["tweet_count_all"]) * 100
        group_col = "senti_percentage"

    #remove ignored movies from the list
    if len(ignore_list) > 0:
        region_movie_grouped = region_movie_grouped[
            ~region_movie_grouped["movieid"].isin(ignore_list)]

    #get the movies with the highest count per region
    most_popular_per_region = region_movie_grouped.loc[
        region_movie_grouped.groupby(['unit_id'])[group_col].idxmax()]

    #this is slow but really helps with generating the figures
    #attach movie titles to results
    movies_df = movie_helper.get_movies_df()
    movie_titles = movies_df[["movieId", "title"]]

    #attach region names
    gb_regions = database_helper.select_query("tweets_region_count")
    gb_regions = gb_regions[["unit_id", "region"]]

    most_popular_per_region = most_popular_per_region.merge(gb_regions,
                                                            how="left",
                                                            on="unit_id")
    most_popular_per_region = most_popular_per_region.merge(
        movie_titles, how="left", left_on="movieid",
        right_on="movieId").drop(columns="movieId")

    return most_popular_per_region
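# Example usage (hypothetical values): favourite movie per region over the
# critical period, ranked by the share of positive tweets rather than raw
# counts ("positive" is an assumed label for whatever senti_class values
# database_helper.select_movie_region_tweets() understands):
#
#   favourites = get_most_popular_movie_per_region(senti_class="positive",
#                                                  senti_percentage=True,
#                                                  critical_period=True)
#   print(favourites[["region", "title", "senti_percentage"]])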
def plot_region_tweets_bar(movieId=0,
                           normalize=False,
                           start_date=None,
                           end_date=None,
                           critical_period=True):
    """
    Function for generating bar plot of regional movie tweets

    :param movieId: integer movieId for creating expectation score for movie tweets
    :param normalize: bool indicating if tweet counts should be normalized
    :param start_date: datetime of start date for filtering tweets
    :param end_date: datetime of end_date for filtering tweets
    :param critical_period: bool indicating if tweets should be filtered to critical period
    """

    #select movie tweets with region cell id and unit id attached
    region_movie_tweets = database_helper.select_movie_region_tweets(
        movieId, start_date=start_date, end_date=end_date)

    #group by region unit_id to get a per region tweet count
    tweet_freq = region_movie_tweets.drop(columns=['movieid']).groupby(
        by="unit_id").size().reset_index(name="movie_tweet_count")

    plot_col = "movie_tweet_count"
    title = "Regional Movie Tweets"
    ylabel = "Movie Tweets"
    movie_title = ""

    if movieId > 0:
        movies_df = database_helper.select_query("movies", {"movieId": movieId})
        movie_title = movies_df.iloc[0]["title"]
        title = movie_title + " Tweets"

    #if normalize, generate the column (movie tweets per million tweets)
    tweet_region_counts = database_helper.select_query("tweets_region_count")
    if normalize:
        tweet_freq = tweet_region_counts.merge(tweet_freq, on="unit_id", how="left")

        #fill na with 0
        tweet_freq = tweet_freq.fillna(0)
        tweet_freq["norm_count"] = (tweet_freq['movie_tweet_count'] /
                                    tweet_freq['tweet_count']) * 1000000
        plot_col = "norm_count"
        title = "Regional Movie Tweets (per million tweets)"
        if movieId > 0:
            title = movie_title + " Tweets (per million tweets)"
        ylabel = "Movie Tweets (per million tweets)"
    else:
        regions = tweet_region_counts[["unit_id", "region"]]
        tweet_freq = tweet_freq.merge(regions, on="unit_id", how="left")

    #check if we need to adjust the title for the critical period
    if critical_period:
        title = "{0} (Critical Period)".format(title)
    elif (start_date != None) and (end_date != None):
        title = "{0} ({1} - {2})".format(title, start_date.date(), end_date.date())

    #create bar plot
    ax = sns.barplot(x="region", y=plot_col, data=tweet_freq)
    ax.set(xlabel='Region', ylabel=ylabel)
    plt.title(title)
    plt.xticks(rotation=90)
    plt.show()

    return tweet_freq
def plot_movie_tweets_map(movieId=0,
                          normalize=False,
                          start_date=None,
                          end_date=None,
                          critical_period=False):
    """
    Function for generating heatmap of movie tweets

    :param movieId: integer movieId for creating expectation score for movie tweets
    :param normalize: bool indicating if tweet counts should be normalized
    :param start_date: datetime of start date for filtering tweets
    :param end_date: datetime of end_date for filtering tweets
    :param critical_period: bool indicating if tweets should be filtered to critical period
    """

    #select movie tweets with region cell id and unit id attached
    region_movie_tweets = database_helper.select_movie_region_tweets(
        movieId, start_date=start_date, end_date=end_date)

    #group by region unit_id to get a per region tweet count
    tweet_freq = region_movie_tweets.drop(columns=['movieid']).groupby(
        by="unit_id").size().reset_index(name="movie_tweet_count")

    map_col = "movie_tweet_count"
    title = "Regional Movie Tweets"
    movie_title = ""

    if movieId > 0:
        movies_df = database_helper.select_query("movies", {"movieId": movieId})
        movie_title = movies_df.iloc[0]["title"]
        title = movie_title + " Tweets"

    #if normalize, generate the column (movie tweets per million tweets)
    if normalize:
        tweet_region_counts = database_helper.select_query("tweets_region_count")
        tweet_freq = tweet_region_counts.merge(tweet_freq, on="unit_id", how="left")

        #fill na with 0
        tweet_freq = tweet_freq.fillna(0)
        tweet_freq["norm_count"] = (tweet_freq['movie_tweet_count'] /
                                    tweet_freq['tweet_count']) * 1000000
        map_col = "norm_count"
        #title = "Regional Movie Tweets (per million tweets)"
        #if movieId > 0:
        #    title = movie_title + " Tweets (per million tweets)"

    #check if we need to adjust the title for the critical period
    if critical_period:
        title = "{0} (Critical Period)".format(title)
    elif (start_date != None) and (end_date != None):
        title = "{0} ({1} - {2})".format(title, start_date.date(), end_date.date())

    #merge with shape file
    gb = gpd.read_file("../../ProjectData/Data/GB/european_region_region.shp")
    map_freq = gb.merge(tweet_freq, left_on='UNIT_ID', right_on='unit_id')

    #plot
    fig, ax = plt.subplots(1, 1, figsize=(11, 9))
    ax.axis('off')
    ax.set_title(title)
    fig.set_dpi(100)
    map_freq.plot(column=map_col, ax=ax, legend=True, cmap='OrRd')
    plt.show()

    return map_freq
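# Example usage (hypothetical movieId): normalized bar chart and heatmap for
# a single film:
#
#   plot_region_tweets_bar(movieId=42, normalize=True)
#   plot_movie_tweets_map(movieId=42, normalize=True)
#
# Note that in both functions critical_period only adjusts the plot title;
# any actual date filtering is done through start_date/end_date.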
""" from tqdm import tqdm import pandas as pd from selenium import webdriver from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.keys import Keys import time import sys sys.path.insert(1, '/home/andy/Documents/MscProject/MscProj/Utils') import database_helper from youtube_helper import YouTubeHelper yt = YouTubeHelper().yt trailers_df = database_helper.select_query("trailers") #filter out this list of selected trailers filter_ids = [ 95, 103, 81, 93, 89, 36, 239, 71, 30, 41, 14, 80, 70, 350, 59, 65, 64, 110, 124, 368, 372, 123 ] filtered_trailers = trailers_df[~trailers_df.id.isin(filter_ids)] def custom_parser(json): snippet = json['snippet']['topLevelComment']['snippet'] comment = { 'commentId': json['id'], 'channelUrl': '',
""" import imdb from tqdm import tqdm import pandas as pd import sys # insert at 1, 0 is the script path (or '' in REPL) sys.path.insert(1, '/home/andy/Documents/MscProject/MscProj/Utils') import database_helper #initialize imdb ia = imdb.IMDb() #greta greta = database_helper.select_query("movies", {"movieId": 234}) greta = greta.iloc[0] greta_res = ia.get_movie('2639336') year = greta_res['year'] if (greta_res.get('genres')): genres = ','.join(greta_res.get('genres')) rating = greta_res.get('rating') votes = greta_res.get('votes') certificates = None if (greta_res.get('certificates')): certificates = ','.join(greta_res.get('certificates')) #update database update_params = { "imdbId": '2639336',