def proc_Douban(df_Douban_raw, df_MovieLens, source=''):
    """Enrich the raw Douban ranking with TMDb details and save it as CSV.

    Every row of ``df_Douban_raw`` is searched on TMDb by its title
    (column 1). The first search result whose release year equals the row's
    year (column 2) is selected; when no result matches the year, the first
    search result is used instead. The row's title is replaced by the TMDb
    title of the selected result, and the values returned by
    ``get_movie_details`` are appended to the row.

    Args:
        df_Douban_raw: DataFrame whose rows are
            ``[rank, name, year, Douban_rating]``.
        df_MovieLens: Forwarded unchanged to ``get_movie_details``.
        source: ``'test'`` writes ``./data/Douban_top_250_test.csv``; any
            other value writes ``./data/Douban_top_250.csv``.

    Returns:
        The enriched DataFrame with columns ``rank, name, year,
        Douban_rating, country, genres, budget, revenue, keywords,
        MLens_rating, tmdb_id``.
    """
    lists_Douban = df_Douban_raw.values.tolist()

    # One client is enough for the whole run.
    movie_api = Movie()
    movie_api.wait_on_rate_limit = True

    for n, a_movie_list in enumerate(lists_Douban, start=1):
        # Retry until the search succeeds. Only ``Exception`` is caught so
        # that Ctrl-C / SystemExit can still break out of the loop.
        while True:
            try:
                search = movie_api.search(a_movie_list[1])
                break
            except Exception as error:
                print(error)
                time.sleep(0.25)
        print(n)  # progress: 1-based index of the movie just searched

        # TMDb years are strings; normalise ours so a numeric year column
        # can still match.
        year = str(a_movie_list[2])

        tmdbId = 0
        # Keep the original title when TMDb returns nothing, instead of
        # reusing the previous movie's title (or raising NameError).
        name = a_movie_list[1]
        for res in search:
            if ('release_date' in res.__dict__
                    and res.release_date.split('-')[0] == year):
                tmdbId = res.id
                name = res.title
                break

        # No result from the right year: fall back to the top search hit.
        if tmdbId == 0 and len(search):
            tmdbId = search[0].id
            name = search[0].title

        a_movie_list[1] = name
        # In-place extension so the row inside ``lists_Douban`` grows.
        a_movie_list += get_movie_details(tmdbId, df_MovieLens)

    df_Douban = pd.DataFrame(lists_Douban,
                             columns=[
                                 'rank', 'name', 'year', 'Douban_rating',
                                 'country', 'genres', 'budget', 'revenue',
                                 'keywords', 'MLens_rating', 'tmdb_id'
                             ])

    if source == 'test':
        df_Douban.to_csv('./data/Douban_top_250_test.csv')
        print('Douban testing dataset processed')
    else:
        df_Douban.to_csv('./data/Douban_top_250.csv')
        print('Complete Douban dataset processed')

    return df_Douban
def proc_Yahoo(df_Yahoo_raw, df_MovieLens, source=''):
    """Enrich the raw Yahoo ranking with TMDb details and save it as CSV.

    Every row of ``df_Yahoo_raw`` is searched on TMDb by its title
    (column 1). The first search result whose release year equals the row's
    year (column 2, with any parentheses removed) is selected; when no
    result matches the year, the first search result is used instead. The
    values returned by ``get_movie_details`` are appended to the row.
    Finally the ``year`` column is stripped of parentheses and converted to
    ``int``.

    Args:
        df_Yahoo_raw: DataFrame whose rows are
            ``[rank, name, year, Yahoo_rating]``. The year is expected to be
            a string, possibly wrapped in parentheses (presumably like
            ``'(1994)'`` - confirm against the scraper).
        df_MovieLens: Forwarded unchanged to ``get_movie_details``.
        source: ``'test'`` writes ``./data/Yahoo_top_500_test.csv``; any
            other value writes ``./data/Yahoo_top_500.csv``.

    Returns:
        The enriched DataFrame with columns ``rank, name, year,
        Yahoo_rating, country, genres, budget, revenue, keywords,
        MLens_rating, tmdb_id``.
    """
    lists_Yahoo = df_Yahoo_raw.values.tolist()

    # One client is enough for the whole run.
    movie_api = Movie()
    movie_api.wait_on_rate_limit = True

    for a_movie_list in lists_Yahoo:
        # Retry until the search succeeds. Only ``Exception`` is caught so
        # that Ctrl-C / SystemExit can still break out of the loop.
        while True:
            try:
                search = movie_api.search(a_movie_list[1])
                break
            except Exception as error:
                print(error)
                time.sleep(0.25)

        # The raw year still carries parentheses at this point; strip them
        # before comparing, otherwise the year can never match a TMDb year.
        year = re.sub(r'[()]', '', str(a_movie_list[2])).strip()

        tmdbId = 0
        for res in search:
            if ('release_date' in res.__dict__
                    and res.release_date.split('-')[0] == year):
                tmdbId = res.id
                break

        # No result from the right year: fall back to the top search hit.
        if tmdbId == 0 and len(search):
            tmdbId = search[0].id

        # In-place extension so the row inside ``lists_Yahoo`` grows.
        a_movie_list += get_movie_details(tmdbId, df_MovieLens)

    df_Yahoo = pd.DataFrame(lists_Yahoo,
                            columns=[
                                'rank', 'name', 'year', 'Yahoo_rating',
                                'country', 'genres', 'budget', 'revenue',
                                'keywords', 'MLens_rating', 'tmdb_id'
                            ])

    # Clean the whole column at once: works for any number of rows and
    # avoids chained assignment (``df['year'][i] = ...``).
    df_Yahoo['year'] = (df_Yahoo['year'].astype(str)
                        .str.replace(r'[()]', '', regex=True)
                        .astype(int))

    if source == 'test':
        df_Yahoo.to_csv('./data/Yahoo_top_500_test.csv')
        print('Yahoo testing dataset processed')
    else:
        df_Yahoo.to_csv('./data/Yahoo_top_500.csv')
        print('Complete Yahoo dataset processed')

    return df_Yahoo
def get_movie_details(tmdbId, df_MovieLens):
    """Collect TMDb details and the MovieLens rating for one movie.

    Args:
        tmdbId: TMDb id of the movie. A falsy value (e.g. ``0``) means no
            movie was found; the defaults are returned without calling TMDb.
        df_MovieLens: DataFrame with at least the columns ``tmdbId`` and
            ``rating``.

    Returns:
        A list ``[country, genres, budget, revenue, keywords, MLens_rating,
        tmdbId]`` where

        * ``country`` is the ``iso_3166_1`` code of the first production
          country, or ``''`` when there is none;
        * ``genres`` and ``keywords`` are the respective names joined with
          ``'/'``;
        * ``budget`` and ``revenue`` are taken as-is from TMDb;
        * ``MLens_rating`` is the MovieLens rating when exactly one row of
          ``df_MovieLens`` has this ``tmdbId``, otherwise NaN.

        Without a usable ``tmdbId`` the strings are empty and the numbers
        are NaN.
    """
    country, genres, keywords = '', '', ''
    # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
    budget = revenue = MLens_rating = np.nan

    if tmdbId:
        movie_api = Movie()
        movie_api.wait_on_rate_limit = True
        entries = movie_api.details(tmdbId).__dict__['entries']

        country_list = entries['production_countries']
        if country_list:
            country = country_list[0]['iso_3166_1']

        genres = '/'.join(genre['name'] for genre in entries['genres'])
        budget = entries['budget']
        revenue = entries['revenue']
        keywords = '/'.join(
            keyword['name'] for keyword in entries['keywords']['keywords'])

        ratings = df_MovieLens.loc[df_MovieLens['tmdbId'] == tmdbId, 'rating']
        # Only an unambiguous single match is trusted.
        if len(ratings) == 1:
            # ``float(Series)`` is deprecated; read the scalar explicitly.
            MLens_rating = float(ratings.iloc[0])

    return [country, genres, budget, revenue, keywords, MLens_rating, tmdbId]