def get_bga_categories_from_api():
    """Fetch the full category list from the BoardGameAtlas API and export it.

    Writes the raw category table to
    '../Data/BoardGameAtlas/Raw/API/Categories/all_bga_categories.csv'.
    """
    endpoint = 'https://api.boardgameatlas.com/api/game/categories?'
    query_params = {'client_id': '16OTwjJZDB'}

    # Query the API for every category it knows about.
    category_records = bga_api_call(endpoint, query_params)['categories']

    # Build a dataframe and drop the 'checked' column, which is not used
    # downstream.
    categories_df = pd.DataFrame(category_records)
    categories_df.drop(columns=['checked'], inplace=True)

    # Rename columns so their origin stays unambiguous after later joins.
    categories_df = categories_df.rename(columns={
        'id': 'category_bga_id',
        'name': 'category_name',
        'url': 'category_bga_url'
    })

    # Export the category list.
    export_df_to_csv(
        categories_df,
        '../Data/BoardGameAtlas/Raw/API/Categories/all_bga_categories.csv')
def extract_users():
    """Create the User table from the integrated reviews dataframe.

    Groups the integrated reviews by (user_name, review_origin) to get one
    row per user, counts each user's ratings, computes the average rating
    and assigns a surrogate 'user_key'. Exports the result to
    '../Data/Joined/Results/User.csv'.
    """
    # import reviews df:
    reviews_path = '../Data/Joined/Integration/Reviews/Reviews_All_Games_Integrated.pickle'
    all_reviews = import_pickle_to_dataframe(reviews_path)

    # Remove the leftover index column from all_reviews.
    # (Including index_col=0 in the read_csv statement throws an error for
    # some unknown reason, so it is dropped here instead.)
    # FIX: the original call discarded the drop() result (no assignment and
    # no inplace=True), so the column was never actually removed.
    all_reviews = all_reviews.drop(all_reviews.columns[0], axis=1)

    # Create user dataframe: one row per (user_name, review_origin) pair.
    users_df = all_reviews.groupby(['user_name', 'review_origin'
                                    ]).size().reset_index(name='num_ratings')

    # Count individual users in both datasets:
    bga_users = users_df[users_df['review_origin'] == 'bga']
    sum_bga_users = len(bga_users)
    bgg_users = users_df[users_df['review_origin'] == 'bgg']
    sum_bgg_users = len(bgg_users)
    print('User count:')
    print('BoardGameAtlas users: ' + str(sum_bga_users))
    print('BoardGameGeeks users: ' + str(sum_bgg_users))

    # Add average rating column to user df. Both groupbys use the same keys,
    # so the aggregation rows line up positionally with users_df.
    users_df['avg_rating'] = all_reviews.groupby(
        ['user_name', 'review_origin'],
        as_index=False).agg({'rating': 'mean'})['rating']

    # Rename origin column:
    users_df.rename(columns={'review_origin': 'user_origin'}, inplace=True)

    # Create surrogate user key (1..n):
    users_df.insert(0, 'user_key', range(1, 1 + len(users_df)))

    # Export users to csv:
    export_df_to_csv(users_df, '../Data/Joined/Results/User.csv')
def get_bga_mechanics_from_api():
    """Fetch the full mechanics list from the BoardGameAtlas API and export it.

    Writes the raw mechanics table to
    '../Data/BoardGameAtlas/Raw/API/Mechanics/all_bga_mechanics.csv'.
    """
    endpoint = 'https://api.boardgameatlas.com/api/game/mechanics?'
    query_params = {'client_id': '16OTwjJZDB'}

    # Query the API for every mechanic it knows about.
    mechanic_records = bga_api_call(endpoint, query_params)['mechanics']

    # Build a dataframe and drop the 'checked' column, which is not used
    # downstream.
    mechanics_df = pd.DataFrame(mechanic_records)
    mechanics_df.drop(columns=['checked'], inplace=True)

    # Rename columns so their origin stays unambiguous after later joins.
    mechanics_df = mechanics_df.rename(columns={
        'id': 'mechanic_bga_id',
        'name': 'mechanic_name',
        'url': 'mechanic_bga_url'
    })

    # Export the mechanics list.
    export_df_to_csv(
        mechanics_df,
        '../Data/BoardGameAtlas/Raw/API/Mechanics/all_bga_mechanics.csv')
def normalize_bga_game_categories_relation():
    """Normalize the BGA game-category relation to (game_key, category_id).

    Replaces the BGA game id with the integrated surrogate 'game_key' and
    exports the two-column relation table to
    '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv'.
    """
    # import bga_games_category_relation (latest version):
    categories_bga_games_relation_path_fuzzy = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_*.json'
    categories_bga_games_relation_path = get_latest_version_of_file(
        categories_bga_games_relation_path_fuzzy)
    categories_bga_games_relation_df = import_json_to_dataframe(
        categories_bga_games_relation_path)

    # FIX: the original additionally read
    # '../Data/BoardGameAtlas/Raw/API/categories/all_bga_categories.csv'
    # into a variable that was never used. The 'categories' casing also did
    # not match the 'Categories' folder the file is exported to (breaks on
    # case-sensitive file systems). The dead read has been removed.

    # import game keys:
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # join the relation table to replace the bga game id with game_key:
    categories_bga_games_relation_df = pd.merge(
        left=categories_bga_games_relation_df,
        right=game_keys,
        left_on='game_id',
        right_on='bga_game_id')

    # normalize by only keeping game_key and category_id:
    categories_bga_games_relation_df = categories_bga_games_relation_df[[
        'game_key', 'category_id'
    ]]

    # export df:
    export_path = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv'
    export_df_to_csv(categories_bga_games_relation_df, export_path)
def replace_old_ids_with_new_key_and_concatenate_category_relation_tables():
    """Map source-specific category ids onto integrated keys and union the
    BGA and BGG category-game relation tables.

    Exports the combined relation to
    '../Data/Joined/Results/Category_Game_Relation.csv' and the normalized
    category table to '../Data/Joined/Results/Categories.csv'.
    """
    # Load the cleaned BGA category-game relation.
    bga_relation = pd.read_csv(
        '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv',
        index_col=0)

    # Load the cleaned BGG category-game relation.
    bgg_relation = pd.read_csv(
        '../Data/BoardGameGeeks/Processed/GameInformation/05_BGA_Game_Categories_Relation_Cleaned.csv',
        index_col=0)

    # Load the integrated category table (still carries the bga/bgg ids).
    categories_df = pd.read_csv(
        '../Data/Joined/Integration/GameInformation/05_categories_Integrated_with_bga_and_bgg_ids.csv',
        index_col=0)

    # BGA side: swap the old bga category id for the integrated category_key.
    bga_relation = pd.merge(left=bga_relation,
                            right=categories_df,
                            left_on='category_id',
                            right_on='category_bga_id')
    bga_relation = bga_relation[['game_key', 'category_key']]

    # BGG side: join via the temporary bgg category key.
    bgg_relation = pd.merge(left=bgg_relation,
                            right=categories_df,
                            left_on='bgg_category_key',
                            right_on='bgg_category_key')
    bgg_relation = bgg_relation[['game_key', 'category_key']]

    # Keep only the integrated key/name/url columns of the category table.
    categories_df = categories_df[[
        'category_key', 'category_name', 'category_bga_url'
    ]].reset_index(drop=True)

    # Union both relation tables, sorted by game key.
    concat_relation = pd.concat(
        [bga_relation, bgg_relation], ignore_index=True,
        sort=False).sort_values(['game_key']).reset_index(drop=True)

    # Drop duplicate rows in both outputs.
    categories_df.drop_duplicates(inplace=True)
    concat_relation.drop_duplicates(inplace=True)

    # Export the combined relation table.
    export_df_to_csv(concat_relation,
                     '../Data/Joined/Results/Category_Game_Relation.csv')

    # Export the normalized category table.
    export_df_to_csv(categories_df, '../Data/Joined/Results/Categories.csv')
def create_list_of_all_bga_designers():
    """Extract the unique BGA designers from the designer-game relation.

    Exports (designer_id, designer_url) pairs to
    '../Data/BoardGameAtlas/Processed/API/BGA_All_Unique_designers.csv'.
    """
    # import bga designers (latest relation file):
    fuzzy_import_path = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation*.json'
    import_path = get_latest_version_of_file(fuzzy_import_path)
    bga_designers_game_relation = import_json_to_dataframe(import_path)

    # extract designer ids and designer urls:
    designers = bga_designers_game_relation[['designer_id', 'designer_url']]

    # keep only unique designers.
    # FIX: drop_duplicates(inplace=True) on a column slice triggers a
    # chained-assignment (SettingWithCopy) warning and may not modify the
    # data; reassigning the result is the reliable form.
    designers = designers.drop_duplicates(subset='designer_id', keep='first')

    # export designers to csv:
    export_path = '../Data/BoardGameAtlas/Processed/API/BGA_All_Unique_designers.csv'
    export_df_to_csv(designers, export_path)
def normalize_bgg_game_categories_relation():
    """Build a BGG category list with temporary keys and normalize the BGG
    game-category relation to (game_key, bgg_category_key).

    Exports the category list to
    '../Data/BoardGameGeeks/Raw/BGG_categories.csv' and the normalized
    relation to the cleaned-relation CSV used by the integration step.
    """
    # Import the most recent BGG game-category relation file.
    relation_path = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/05_BGG_Game_category_Relation_*.csv')
    relation_df = pd.read_csv(relation_path, index_col=0)

    # Derive the unique category list from the relation table.
    categories_bgg = pd.DataFrame(
        relation_df['category_name'].drop_duplicates())

    # Assign temporary surrogate keys starting at 1001.
    categories_bgg.insert(0, 'bgg_category_key',
                          range(1001, 1001 + len(categories_bgg)))

    # Import the integrated game keys.
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # Replace bgg_game_id with the integrated game_key.
    relation_df = pd.merge(left=relation_df,
                           right=game_keys,
                           left_on='bgg_game_id',
                           right_on='bgg_game_id')

    # Replace category_name with its temporary bgg_category_key.
    relation_df = pd.merge(left=relation_df,
                           right=categories_bgg,
                           left_on='category_name',
                           right_on='category_name')

    # Normalize: keep only the two key columns.
    relation_df = relation_df[['game_key', 'bgg_category_key']]

    # Export the bgg category list.
    export_df_to_csv(categories_bgg,
                     '../Data/BoardGameGeeks/Raw/BGG_categories.csv')

    # Export the normalized game-category relation. NOTE(review): the file
    # name says 'BGA' although this is BGG data — kept as-is because the
    # downstream integration step reads exactly this path.
    export_df_to_csv(
        relation_df,
        '../Data/BoardGameGeeks/Processed/GameInformation/05_BGA_Game_Categories_Relation_Cleaned.csv')
def integrate_game_name_relation_tables():
    """Union the BGA and BGG game-name relation tables into one
    (game_key, game_name) translation table.

    Exports the result to '../Data/Joined/Results/GameNameTranslation.csv'.
    """
    # Import BGA game_name_relation table:
    fuzzy_import_path_1 = '../Data/BoardGameAtlas/Processed/API/06_BGA_Game_Names_Relation_*.json'
    import_path_1 = get_latest_version_of_file(fuzzy_import_path_1)
    names_bga = import_json_to_dataframe(import_path_1)

    # Import BGG game_name_relation table:
    fuzzy_import_path_2 = '../Data/BoardGameGeeks/Processed/GameInformation/06_BGG_Game_Name_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(fuzzy_import_path_2)
    names_bgg = pd.read_csv(import_path_2, index_col=0)

    # Import game keys:
    import_path_3 = '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv'
    game_keys = pd.read_csv(import_path_3, index_col=0)

    # Replace bga 'game_id' with 'game_key':
    names_bga = pd.merge(left=names_bga,
                         right=game_keys,
                         left_on='game_id',
                         right_on='bga_game_id')
    names_bga = names_bga[['game_key', 'game_name']]

    # Replace bgg 'game_id' with 'game_key':
    names_bgg = pd.merge(left=names_bgg,
                         right=game_keys,
                         left_on='bgg_game_id',
                         right_on='bgg_game_id')
    names_bgg = names_bgg[['game_key', 'game_name']]

    # Merge both dataframes:
    names_combined = pd.concat([names_bga, names_bgg]).sort_values('game_key')

    # Remove duplicates.
    # FIX: the original called drop_duplicates(inplace=True) twice in a row;
    # the second call was a no-op and has been removed.
    print(
        'Number of duplicate game names in GameNameTranslation table found and dropped: '
        + str(len(names_combined) - len(names_combined.drop_duplicates())))
    names_combined.drop_duplicates(inplace=True)

    # Export result:
    export_path = '../Data/Joined/Results/GameNameTranslation.csv'
    export_df_to_csv(names_combined, export_path)
def clean_bga_api_review_data():
    """Clean the raw BGA review dump and export it as CSV.

    - removes invalid ratings (< 1)
    - rescales ratings from the BGA 1-5 scale to the common 1-10 scale
    - adds 'has_review_text' and 'review_origin' helper columns
    """
    filename = '../Data/BoardGameAtlas/Processed/API/bga_all_reviews_for_games_with_more_than_2_reviews.json'

    # If the raw review file does not exist yet, gather it first.
    if not os.path.isfile(filename):
        gather_bga_api_review_data()

    # import data (same path as the existence check above):
    df = import_json_to_dataframe(filename)

    # Remove reviews with a rating < 1: these ratings are actually errors
    # since the rating scale goes from 1-5 (not 0-5) when rating games on
    # boardgameatlas.com.
    df = df[df['rating'] >= 1]

    # Adjust ratings so that they correspond to a scale from 1-10 instead of 1-5:
    # a BGA rating of 1 becomes 1 * 2.25 - 1.25 = 1
    # a BGA rating of 5 becomes 5 * 2.25 - 1.25 = 10
    df['rating'] = 2.25 * df['rating'] - 1.25

    # drop column review_title:
    del df['review_title']

    # add column has_review_text (0 = only rating, no text; 1 = rating + text):
    df['has_review_text'] = np.where(df['review_text'].isnull(), 0, 1)

    # add column that states the origin of the review (data source):
    df['review_origin'] = 'bga'

    # Rename columns. FIX: the rename target for 'username' was corrupted
    # ('******'); restored to 'user_name', the column name every downstream
    # step (extract_users, clean_reviews) joins on.
    df.rename(columns={
        'username': 'user_name',
        'date': 'review_date'
    }, inplace=True)

    export_df_to_csv(
        df,
        '../Data/BoardGameAtlas/Processed/API/bga_all_reviews_for_games_with_more_than_2_reviews_CLEANED.csv'
    )
def merge_scraped_bga_designer_data_and_api_data():
    """Join the scraped BGA designer data with the API designer-game
    relation and export the merged table."""
    # Scraped designer data (Scrapy output).
    designers_scrapy = import_json_to_dataframe(
        '../Data/BoardGameAtlas/Raw/Scrapy/designers/bga_designers.json')

    # Latest API designer-game relation file.
    relation_path = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation_*.json')
    designers_game_relation = import_json_to_dataframe(relation_path)

    # The image url column holds single-element lists; unwrap them.
    designers_scrapy = designers_scrapy.explode('designer_bga_image_url')

    # Join both sources on the designer page url.
    designers_merged = pd.merge(left=designers_scrapy,
                                right=designers_game_relation,
                                left_on='designer_url',
                                right_on='designer_url')

    # Export the merged dataframe.
    export_df_to_csv(
        designers_merged,
        '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv')
def merge_game_information():
    '''
    Function merges the boardgames of the previously matched games.
    For the matched games there are four types of columns:

    a)  columns that exist in both datasets but we only need to keep one of
        them (can include conflicting values):
        (e.g. name, year_published, min_players, ...)
        In this case we chose to keep the bgg columns!
        ["trust-your-friends" avoidance strategy as in case of contradicting
        values we keep values based on which data source they come from]

    b)  columns that exist in both datasets but we want to keep both:
        (e.g. bga_game_id/bgg_game_id, num_user_ratings,
        average_user_rating, bga_rank/bgg_rank, ...)

    c)  columns that exist only in the bgg dataset:
        (e.g. num_user_comments, bgg_average_weight, ...)

    d)  columns that exist only in the bga dataset:
        (e.g. reddit_all_time_count, bga_game_url, ...)
    '''
    # import data:
    # bgg game information dataset:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    # bga game information dataset:
    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)

    # 1) split the data into three groups:
    # a) matched games
    # b) BGG games that could not be matched
    # c) BGA games that could not be matched

    # 1a) matched games. FIX: the matched subsets are mutated below, so they
    # are copied explicitly — mutating a boolean-indexed slice raises
    # SettingWithCopyWarning and may not stick.
    ids_matched_games_df = pd.read_csv(
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv',
        index_col=0)
    bgg_subset_matches = bgg_df[bgg_df['bgg_game_id'].isin(
        ids_matched_games_df['bgg_game_id'])].copy()
    bga_subset_matches = bga_df[bga_df['bga_game_id'].isin(
        ids_matched_games_df['bga_game_id'])].copy()

    # 1b) BGG games not matched:
    bgg_subset_no_matches = bgg_df[~bgg_df['bgg_game_id'].isin(
        ids_matched_games_df['bgg_game_id'])]
    # 1c) BGA games not matched:
    bga_subset_no_matches = bga_df[~bga_df['bga_game_id'].isin(
        ids_matched_games_df['bga_game_id'])]

    # 2a) drop the type-a) duplicate columns from the bga dataset — the bgg
    # versions of these columns are the ones we keep:
    drop_bga_columns = [
        'name', 'year_published', 'min_players', 'max_players',
        'min_playtime', 'max_playtime', 'min_age', 'game_description',
        'image_url', 'thumbnail_url'
    ]
    bga_subset_matches.drop(columns=drop_bga_columns, inplace=True)

    # add 'matched_bgg_id' column:
    bga_subset_matches = pd.merge(left=bga_subset_matches,
                                  right=ids_matched_games_df,
                                  left_on='bga_game_id',
                                  right_on='bga_game_id')

    # merge both datasets:
    matched_games_df = pd.merge(left=bgg_subset_matches,
                                right=bga_subset_matches,
                                left_on=['bgg_game_id'],
                                right_on=['bgg_game_id'])

    # Handle duplicate ids in matched_games_df:
    matched_games_df.drop_duplicates(subset=['bgg_game_id'],
                                     keep='first',
                                     inplace=True)
    matched_games_df.drop_duplicates(subset=['bga_game_id'],
                                     keep='first',
                                     inplace=True)

    # Union step: concatenate all three dataframes into one.
    # FIX: DataFrame.append is deprecated (removed in pandas 2.0);
    # pd.concat is the equivalent call.
    games_df = pd.concat(
        [matched_games_df, bgg_subset_no_matches, bga_subset_no_matches],
        ignore_index=True,
        sort=False)

    # reorder columns:
    cols_to_order = [
        'name', 'bgg_game_id', 'bga_game_id', 'year_published', 'min_players',
        'max_players', 'min_playtime', 'max_playtime', 'min_age',
        'bgg_average_user_rating', 'bga_average_user_rating',
        'bgg_num_user_ratings', 'bga_num_user_ratings'
    ]
    new_columns = cols_to_order + (
        games_df.columns.drop(cols_to_order).tolist())
    games_df = games_df[new_columns]

    # create new unique key column:
    games_df.insert(0, 'game_key', range(100001, 100001 + len(games_df)))

    # key table that maps game_key to bga_game_id / bgg_game_id:
    key_df = games_df[['game_key', 'bga_game_id', 'bgg_game_id']]

    # check if there are any duplicates in games_df:
    games_df_duplicates = len(games_df) - len(games_df.drop_duplicates())
    if games_df_duplicates > 0:
        print('Warning. ' + str(games_df_duplicates) +
              ' duplicates found in BoardGameTable: ')
        games_df.drop_duplicates(inplace=True)
        print('Duplicates removed!')

    # check if there are any duplicate ids in key_df (nulls excluded because
    # unmatched games legitimately lack one of the two ids):
    count_duplicates_bgg = len(key_df[~key_df['bgg_game_id'].isnull()]) - len(
        key_df[~key_df['bgg_game_id'].isnull()].drop_duplicates(
            subset='bgg_game_id'))
    count_duplicates_bga = len(key_df[~key_df['bga_game_id'].isnull()]) - len(
        key_df[~key_df['bga_game_id'].isnull()].drop_duplicates(
            subset='bga_game_id'))
    # FIX: the original summed count_duplicates_bga twice, so bgg-only
    # duplicates never triggered this warning.
    if (count_duplicates_bgg + count_duplicates_bga) > 0:
        print('Warning. Duplicates found: ')
        print('BGG_game_ids: ' + str(count_duplicates_bgg))
        print('BGA_game_ids: ' + str(count_duplicates_bga))
        key_df.drop_duplicates(inplace=True)
        print('Duplicates removed:')

    # Fix badly encoded symbols in the game description.
    # NOTE(review): several patterns below are identical to their replacement
    # (e.g. 'ä' -> 'ä'); they were presumably mojibake repairs whose source
    # pattern was lost when this file itself was re-encoded — kept unchanged.
    # str.replace treats the patterns as regular expressions (pandas
    # default here), which the '&#...;' / ' +' patterns rely on.
    description_replacements = [
        # quotation marks and misc symbols
        (r'"', '\''),
        (r'”', '\''),
        (r'’', '\''),
        (r'“', '\''),
        (r'&', '&'),
        (r'é', 'e'),
        # umlauts
        (r'ä', 'ä'),
        (r'Ü', 'ü'),
        (r'ü', 'ü'),
        (r'ö', 'ö'),
        (r'ß', 'ß'),
        # dashes & non-breaking space
        (r'–', '-'),
        (r'—', '-'),
        (r' ', ' '),
        (r'×', 'x'),
        (r'­', '-'),
        # html character references
        (r'&#...;', ''),
        (r'&#..;', ' '),
        (r'&#.;', ''),
        (r'.....;', ''),
        (r'....;', ''),
        (r'...;', ''),
        (r'..;', ''),
        (r'.;', ''),
        # double semicolons and repeated spaces
        (r';;', ' '),
        (' +', ' '),
    ]
    for pattern, replacement in description_replacements:
        games_df['game_description'] = games_df[
            'game_description'].str.replace(pattern, replacement)
    games_df['game_description'] = games_df['game_description'].str.strip()

    # export to csv:
    export_df_to_csv(games_df, '../Data/Joined/Results/BoardGames.csv')
    export_df_to_csv(
        key_df,
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv')
def merge_bga_and_bgg_publisher_game_relation():
    """Map both publisher-game relations onto integrated keys and union them.

    Exports the combined (game_key, publisher_key) relation to
    '../Data/Joined/Results/Publisher_Game_Relation.csv'.
    """
    # BGA publisher-game relation (scraped + API merge).
    relation_bga = pd.read_csv(
        '../Data/BoardGameAtlas/Processed/API/bga_publishers_scrapy_and_api_data_merged.csv',
        index_col=0)
    relation_bga = relation_bga[['game_id', 'publisher_id']]

    # BGG publisher-game relation (latest version).
    bgg_path = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/02_BGG_Game_Publisher_Relation_*.csv')
    relation_bgg = pd.read_csv(bgg_path, index_col=0)
    relation_bgg = relation_bgg[['bgg_game_id', 'publisher_name']]

    # Integrated publisher table.
    publishers = pd.read_csv('../Data/Joined/Results/publisher.csv',
                             index_col=0)

    # Integrated game keys.
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # BGA side, step 1: game_id -> game_key.
    relation_bga = pd.merge(left=relation_bga,
                            right=game_keys,
                            left_on='game_id',
                            right_on='bga_game_id')
    relation_bga = relation_bga[['game_key', 'publisher_id']]

    # BGA side, step 2: publisher_bga_id -> publisher_key.
    relation_bga = pd.merge(left=publishers,
                            right=relation_bga,
                            left_on='publisher_bga_id',
                            right_on='publisher_id')
    relation_bga = relation_bga[['game_key', 'publisher_key']]

    # BGG side, step 1: bgg_game_id -> game_key.
    relation_bgg = pd.merge(left=relation_bgg,
                            right=game_keys,
                            left_on='bgg_game_id',
                            right_on='bgg_game_id')
    relation_bgg = relation_bgg[['game_key', 'publisher_name']]

    # BGG side, step 2: publisher_name -> publisher_key.
    relation_bgg = pd.merge(left=relation_bgg,
                            right=publishers,
                            left_on='publisher_name',
                            right_on='bgg_publisher_name')
    relation_bgg = relation_bgg[['game_key', 'publisher_key']]

    # Union both relations and drop duplicate pairs.
    combined = pd.concat([relation_bga, relation_bgg])
    combined.drop_duplicates(inplace=True)

    # Export the combined relation.
    export_df_to_csv(combined,
                     '../Data/Joined/Results/Publisher_Game_Relation.csv')
def merge_bga_and_bgg_designers():
    """Integrate the BGA and BGG designer tables into one Designer table.

    Designers are matched by exact name first, then by Jaccard similarity
    (threshold JACCARD_THRESHOLD_DESIGNERS). Unmatched designers of either
    source are kept as-is. Result structure:
    designer_key | designer_name | designer_bga_id | designer_bgg_key |
    bgg_designer_name | designer_url | designer_image_url
    Exports to '../Data/Joined/Results/Designer.csv'.
    """
    # import bga data:
    import_path_1 = '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv'
    bga_designers_game_relation = pd.read_csv(import_path_1, index_col=0)

    # import bgg data:
    import_path_2_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/03_BGG_Game_designer_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    bgg_designers_game_relation = pd.read_csv(import_path_2, index_col=0)

    # drop NA's from bga_designers_game_relation names:
    bga_designers_na = bga_designers_game_relation[
        bga_designers_game_relation['designer_name'].isnull()]
    if len(bga_designers_na) > 0:
        print(
            str(len(bga_designers_na)) +
            ' rows dropped from bga_game_relation table because designer_names are missing.'
        )
    bga_designers_game_relation = bga_designers_game_relation[
        ~bga_designers_game_relation['designer_name'].isnull()]

    # create bga designers df (unique designers only):
    bga_designers_df = bga_designers_game_relation[[
        'designer_name', 'designer_bga_image_url', 'designer_url',
        'designer_id'
    ]].drop_duplicates()
    bga_designers_df.rename(columns={
        'designer_bga_image_url': 'designer_image_url',
        'designer_id': 'designer_bga_id'
    }, inplace=True)

    # create bgg designers df:
    bgg_designers_df = pd.DataFrame(
        bgg_designers_game_relation[['designer_name']].drop_duplicates())
    bgg_designers_df.rename(columns={'designer_name': 'bgg_designer_name'},
                            inplace=True)

    # add temporary bgg_designer_key:
    bgg_designers_df.insert(0, 'designer_bgg_key',
                            range(1, 1 + len(bgg_designers_df)))

    # extract unique designer names of both sources:
    bga_designer_names_list = bga_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()
    bgg_designer_names_list = bgg_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()

    # match designer names:
    # get exact name matches:
    exact_matches = list(
        set(bga_designer_names_list).intersection(bgg_designer_names_list))

    # subsets for data that could not get matched exactly:
    bga_names_not_matched_list = [
        name for name in bga_designer_names_list if name not in exact_matches
    ]
    bgg_names_not_matched_list = [
        name for name in bgg_designer_names_list if name not in exact_matches
    ]

    # jaccard matching for names that could not be matched exactly:
    matches = []
    for bga_designer in bga_names_not_matched_list:
        match = find_match(bga_designer, bgg_names_not_matched_list,
                           JACCARD_THRESHOLD_DESIGNERS)
        matches.append({
            'bga_name': bga_designer,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })

    # names matched via jaccard similarity (an empty 'jaccard_score' marks
    # "no candidate cleared the threshold"):
    jaccard_matches_bga = [
        designer['bga_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]
    jaccard_matches_bgg = [
        designer['bgg_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]

    # create list of all names matched:
    all_matches_bga = exact_matches + jaccard_matches_bga
    all_matches_bgg = exact_matches + jaccard_matches_bgg

    # create dataframe of jaccard-matched designers.
    # FIX: pass the columns explicitly so an empty match list still yields a
    # dataframe with the columns the merges below rely on.
    jaccard_matches_df = pd.DataFrame(
        matches, columns=['bga_name', 'bgg_name', 'jaccard_score'])
    jaccard_matches_df = jaccard_matches_df[
        jaccard_matches_df['jaccard_score'] != ''].sort_values(
            'jaccard_score', ascending=False)
    del jaccard_matches_df['jaccard_score']

    # 1) Create DF of all designers that could be matched
    #    a) exact matches  b) jaccard matches
    # 2) Create DF of bga designers that could not be matched
    # 3) Create DF of bgg designers that could not be matched
    # 4) Concat all DFs to one designers df

    # 1a) exact matches:
    bga_exact_matches = bga_designers_df[
        bga_designers_df['designer_name'].isin(exact_matches)]
    bgg_exact_matches = bgg_designers_df[
        bgg_designers_df['bgg_designer_name'].isin(exact_matches)]
    joined_exact_matches = pd.merge(left=bga_exact_matches,
                                    right=bgg_exact_matches,
                                    left_on='designer_name',
                                    right_on='bgg_designer_name')

    # 1b) jaccard matches:
    bga_jaccard_matches = pd.merge(left=bga_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='designer_name',
                                   right_on='bga_name')
    bgg_jaccard_matches = pd.merge(left=bgg_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='bgg_designer_name',
                                   right_on='bgg_name')
    joined_jaccard_matches = pd.merge(left=bga_jaccard_matches,
                                      right=bgg_jaccard_matches,
                                      left_on='designer_name',
                                      right_on='bga_name')
    # drop columns not needed:
    joined_jaccard_matches = joined_jaccard_matches[[
        'designer_name', 'designer_bga_id', 'designer_bgg_key',
        'bgg_designer_name', 'designer_url', 'designer_image_url'
    ]]

    # 2) bga designers without any match:
    bga_no_matches = bga_designers_df[
        ~bga_designers_df['designer_name'].isin(all_matches_bga)]

    # 3) bgg designers without any match.
    # FIX: .copy() so adding the column below does not modify a slice of
    # bgg_designers_df (SettingWithCopyWarning).
    bgg_no_matches = bgg_designers_df[
        ~bgg_designers_df['bgg_designer_name'].isin(all_matches_bgg)].copy()
    bgg_no_matches['designer_name'] = bgg_no_matches['bgg_designer_name']

    # 4) Create large dataframe by concatenating all dataframes:
    designers_df = pd.concat([
        joined_exact_matches, joined_jaccard_matches, bga_no_matches,
        bgg_no_matches
    ])

    # FIX: remove duplicates BEFORE inserting the unique designer_key; the
    # original deduplicated afterwards, which could never drop anything
    # because the key column made every row unique.
    designers_df.drop_duplicates(inplace=True)

    # add designer key:
    designers_df.insert(0, 'designer_key', range(1, 1 + len(designers_df)))

    # export designers:
    export_path = '../Data/Joined/Results/Designer.csv'
    export_df_to_csv(designers_df, export_path)
def clean_reviews():
    """Normalize the integrated reviews and filter out sparse games/users.

    Replaces user names with user keys, removes duplicate ratings per
    (game_key, user_key), drops reviews of games/users below the configured
    minimum counts and exports the reduced table to
    '../Data/Joined/Results/Reviews_Reduced.csv'.
    """
    # Load the user table, keeping only the join-relevant columns.
    users_df = pd.read_csv('../Data/Joined/Results/User.csv', index_col=0)
    users_df = users_df[['user_key', 'user_name', 'user_origin']]

    # Load the integrated reviews.
    all_reviews = import_pickle_to_dataframe(
        '../Data/Joined/Integration/Reviews/Reviews_All_Games_Integrated.pickle')

    # Drop the bga-specific user_id plus id/date columns that are no longer
    # needed; user identity is re-attached via user_key below.
    all_reviews.drop(columns=[
        'user_id', 'review_id', 'review_date', 'game_id', 'bga_game_id',
        'bgg_game_id'
    ], inplace=True)

    # Attach the user_key via (origin, user_name).
    all_reviews = pd.merge(left=all_reviews,
                           right=users_df,
                           left_on=['review_origin', 'user_name'],
                           right_on=['user_origin', 'user_name'])

    # Normalize: the key now replaces name and origin.
    all_reviews.drop(columns=['user_name', 'review_origin'], inplace=True)

    # Move the key columns to the front.
    leading_cols = [
        'game_key', 'user_key', 'rating', 'review_text', 'has_review_text'
    ]
    all_reviews = all_reviews[leading_cols +
                              all_reviews.columns.drop(leading_cols).tolist()]

    # The review text is unused and memory-heavy; drop it.
    del all_reviews['review_text']

    ## Take care of duplicate ratings per (game_key, user_key):
    duplicate_count = len(all_reviews) - len(
        all_reviews.drop_duplicates(subset=['game_key', 'user_key'],
                                    keep='first'))
    if duplicate_count > 0:
        print(f'Warning. Duplicates {duplicate_count} found: ')
        all_reviews.drop_duplicates(subset=['game_key', 'user_key'],
                                    keep='first',
                                    inplace=True)
        print('Found duplicates removed!')

    ## Keep only reviews of games/users that clear the configured minimums:
    print(f'Removing reviews of games with less than {MIN_REVIEWS_PER_GAME}'
          f' reviews and users with less than {MIN_REVIEWS_PER_USER}'
          ' reviews.')

    reviews_before = len(all_reviews)
    games_before = all_reviews['game_key'].nunique()
    users_before = all_reviews['user_key'].nunique()

    # Filter games below the minimum review count.
    per_game = all_reviews.game_key.value_counts()
    all_reviews = all_reviews[all_reviews.game_key.isin(
        per_game.index[per_game.ge(MIN_REVIEWS_PER_GAME)])]

    # Filter users below the minimum review count (counted on the already
    # game-filtered data, as in the original flow).
    per_user = all_reviews.user_key.value_counts()
    all_reviews = all_reviews[all_reviews.user_key.isin(
        per_user.index[per_user.ge(MIN_REVIEWS_PER_USER)])]

    ## Track changes:
    reviews_after = len(all_reviews)
    games_after = all_reviews['game_key'].nunique()
    users_after = all_reviews['user_key'].nunique()

    # Absolute number of dropped values:
    reviews_dropped_abs = reviews_before - reviews_after
    games_dropped_abs = games_before - games_after
    users_dropped_abs = users_before - users_after

    # Relative number of dropped values:
    reviews_dropped_rel = reviews_dropped_abs / reviews_before
    games_dropped_rel = games_dropped_abs / games_before
    users_dropped_rel = users_dropped_abs / users_before

    # Summary table of the reduction.
    table = [
        ['Reviews', reviews_before, reviews_after, reviews_dropped_abs,
         reviews_dropped_rel],
        ['Games', games_before, games_after, games_dropped_abs,
         games_dropped_rel],
        ['Users', users_before, users_after, users_dropped_abs,
         users_dropped_rel],
    ]
    print(
        tabulate(table,
                 headers=[
                     '_', 'Count (full dataset)', 'Count (reduced dataset)',
                     'Num dropped (absolute)', 'Num dropped (relative)'
                 ]))

    # Utility-matrix size prior to and after reducing the dataset.
    matrix_size_before = games_before * users_before
    matrix_size_after = games_after * users_after
    print(f'Reviews dropped: {reviews_dropped_rel * 100} %')
    print(f'Utility matrix full dataset: {games_before} * {users_before}')
    print(f'Utility matrix reduced dataset: {games_after} * {users_after}')
    print('Size of utility matrix reduced by: ' +
          str(100 * (1 - (matrix_size_after / matrix_size_before))) + ' %')

    # Export df:
    export_df_to_csv(all_reviews,
                     '../Data/Joined/Results/Reviews_Reduced.csv')
def _extract_bgg_relation(df, source_col, name_col, extra_cols=()):
    """Extract a (bgg_game_id, value) relation from a list-valued column.

    The raw BGG CSV stores list columns (publishers, designers, ...) as
    stringified Python lists, e.g. "['Hans im Glueck', 'Rio Grande']".

    Parameters:
        df: main BGG games dataframe; ``source_col`` is deleted from it
            in place once the relation has been extracted.
        source_col: name of the list-valued column to extract.
        name_col: column name for the exploded values in the result.
        extra_cols: additional columns to create, filled with NaN
            (placeholders to align with the BGA schema).

    Returns a dataframe with one row per (game, value) pair.
    """
    # local import keeps the helper self-contained without touching the
    # module-level import block
    from ast import literal_eval

    # .copy() so the rename/assignments below act on a real frame,
    # not a view of df (avoids SettingWithCopyWarning / silent no-ops)
    relation_df = df[['bgg_game_id', source_col]].copy()
    relation_df.rename(columns={source_col: name_col}, inplace=True)
    for col in extra_cols:
        relation_df[col] = np.nan
    relation_df = relation_df[relation_df[name_col].notna()]
    # literal_eval is a safe replacement for eval(): the cells are plain
    # list literals, and literal_eval refuses anything executable
    relation_df[name_col] = relation_df[name_col].apply(literal_eval)
    # explode turns each list element into its own row
    relation_df = relation_df.explode(name_col)
    del df[source_col]
    return relation_df


def clean_bgg_games():
    """Clean the raw BGG games dump and split it into relational tables.

    Reads games_detailed_info.csv, drops unwanted columns, normalizes
    column names, then extracts publishers, designers, mechanics,
    categories and alternate names into separate one-row-per-pair
    dataframes. All six results are exported as timestamped CSV files.
    """
    filename = '../Data/BoardGameGeeks/Raw/games_detailed_info.csv'
    df = pd.read_csv(filename)

    # remove first column (unnamed index written by the scraper):
    df.drop(df.columns[0], axis=1, inplace=True)

    # remove further unwanted columns:
    columns_to_remove = [
        'type', 'playingtime', 'boardgameartist', 'boardgamefamily',
        'boardgameexpansion', 'boardgameimplementation', 'median', 'owned',
        'wanting', 'wishing', 'trading', 'numweights',
        'suggested_num_players', 'suggested_playerage',
        'suggested_language_dependence', 'boardgameintegration',
        'boardgamecompilation', 'Strategy Game Rank', 'Family Game Rank',
        'Party Game Rank', 'Abstract Game Rank', 'Thematic Rank',
        'War Game Rank', 'Customizable Rank', "Children's Game Rank",
        'RPG Item Rank', 'Accessory Rank', 'Video Game Rank', 'Amiga Rank',
        'Commodore 64 Rank', 'Arcade Rank', 'Atari ST Rank'
    ]
    df.drop(columns_to_remove, axis=1, inplace=True)

    # rename columns:
    df.rename(columns={
        'id': 'bgg_game_id',
        'primary': 'name',
        'yearpublished': 'year_published',
        'minplayers': 'min_players',
        'maxplayers': 'max_players',
        'minplaytime': 'min_playtime',
        'maxplaytime': 'max_playtime',
        'minage': 'min_age',
        'usersrated': 'bgg_num_user_ratings',
        'numcomments': 'bgg_num_user_comments',
        'average': 'bgg_average_user_rating',
        'bayesaverage': 'bgg_bayes_average',
        'stddev': 'bgg_stddev',
        'Board Game Rank': 'bgg_rank',
        'description': 'game_description',
        'image': 'image_url',
        'thumbnail': 'thumbnail_url',
        'averageweight': 'bgg_average_weight'
    }, inplace=True)

    # desired dataframes:
    # 1) main_game_information  2) publishers  3) designers
    # 4) mechanics  5) categories  6) names
    # each extraction also removes the source column from df:
    publishers_df = _extract_bgg_relation(
        df, 'boardgamepublisher', 'publisher_name',
        extra_cols=('publisher_url', 'publisher_id'))
    designers_df = _extract_bgg_relation(
        df, 'boardgamedesigner', 'designer_name',
        extra_cols=('designer_url', 'designer_id'))
    mechanics_df = _extract_bgg_relation(
        df, 'boardgamemechanic', 'mechanic_name',
        extra_cols=('mechanic_id', 'mechanic_url'))
    categories_df = _extract_bgg_relation(
        df, 'boardgamecategory', 'category_name',
        extra_cols=('category_id', 'category_url'))
    names_df = _extract_bgg_relation(df, 'alternate', 'game_name')

    # Export all 6 dataframes. The timestamp is computed once so a single
    # run always produces a consistent file set (previously datetime.now()
    # was re-evaluated per export and could straddle a minute boundary):
    path = '../Data/BoardGameGeeks/Processed/GameInformation/'
    stamp = datetime.now().strftime("%d_%m_%Y-%H_%M")
    export_df_to_csv(df, path + '01_BGG_Game_Information_' + stamp + '.csv')
    export_df_to_csv(
        publishers_df, path + '02_BGG_Game_Publisher_Relation_' + stamp + '.csv')
    export_df_to_csv(
        designers_df, path + '03_BGG_Game_Designer_Relation_' + stamp + '.csv')
    export_df_to_csv(
        mechanics_df, path + '04_BGG_Game_Mechanic_Relation_' + stamp + '.csv')
    export_df_to_csv(
        categories_df, path + '05_BGG_Game_Category_Relation_' + stamp + '.csv')
    export_df_to_csv(
        names_df, path + '06_BGG_Game_Name_Relation_' + stamp + '.csv')
def match_and_merge_bga_and_bgg_categories():
    """Match BGA and BGG categories by name and merge them into one table.

    BGA category names are matched against BGG category names via Jaccard
    similarity. Matched pairs are joined into single rows carrying both
    source ids; unmatched categories from either side are appended as-is.
    The result gets a surrogate `category_key` and a unified
    `category_name` (BGG name preferred, BGA name as fallback) and is
    exported to csv.
    """
    # import bga and bgg categories.
    # BUG FIX: directory is 'Categories' (capitalized) — that is what
    # get_bga_categories_from_api exports; the lower-case 'categories'
    # fails on case-sensitive filesystems.
    bga_categories = pd.read_csv(
        '../Data/BoardGameAtlas/Raw/API/Categories/all_bga_categories.csv',
        index_col=0)
    bgg_categories = pd.read_csv(
        '../Data/BoardGameGeeks/Raw/BGG_categories.csv', index_col=0)

    bga_categories_names = bga_categories['category_name'].tolist()
    bgg_categories_names = bgg_categories['category_name'].tolist()

    category_jaccard_threshold = 0.4

    # match every bga category against all bgg category names:
    match_list = []
    for bga_category in bga_categories_names:
        match = find_match(bga_category, bgg_categories_names,
                           category_jaccard_threshold)
        match_list.append({
            'bga_name': bga_category,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })
    matches_df = pd.DataFrame(match_list)

    # drop entries that could not be matched (find_match signals "no match"
    # with an empty-string jaccard_score):
    matches_df = matches_df[matches_df['jaccard_score'] != ''].sort_values(
        'jaccard_score', ascending=False)

    bga_names_matched = matches_df['bga_name'].tolist()
    bgg_names_matched = matches_df['bgg_name'].tolist()

    # build subsets depending on whether the category name was matched.
    # .copy() so the renames below act on real frames, not views:
    bga_subset_matches = bga_categories[
        bga_categories['category_name'].isin(bga_names_matched)].copy()
    bgg_subset_matches = bgg_categories[
        bgg_categories['category_name'].isin(bgg_names_matched)].copy()
    bga_subset_no_matches = bga_categories[
        ~bga_categories['category_name'].isin(bga_names_matched)].copy()
    bgg_subset_no_matches = bgg_categories[
        ~bgg_categories['category_name'].isin(bgg_names_matched)].copy()

    # rename category_name column so matched/unmatched rows line up after
    # the concat below:
    bga_subset_matches.rename(columns={'category_name': 'bga_name'},
                              inplace=True)
    bgg_subset_matches.rename(columns={'category_name': 'bgg_name'},
                              inplace=True)
    bga_subset_no_matches.rename(columns={'category_name': 'bga_name'},
                                 inplace=True)
    bgg_subset_no_matches.rename(columns={'category_name': 'bgg_name'},
                                 inplace=True)

    # join matches: first pull in the BGA attributes, then the BGG ones
    subset_matches = pd.merge(left=bga_subset_matches, right=matches_df,
                              on='bga_name')
    subset_matches = pd.merge(left=subset_matches, right=bgg_subset_matches,
                              on='bgg_name')

    # keep only relevant columns:
    subset_matches = subset_matches[[
        'category_bga_id', 'bgg_category_key', 'bga_name', 'bgg_name',
        'category_bga_url'
    ]]

    # concat matched rows with the unmatched leftovers from both sides:
    all_categories = pd.concat(
        [subset_matches, bga_subset_no_matches, bgg_subset_no_matches],
        ignore_index=True,
        sort=False).sort_values(['bga_name'])

    # create surrogate category key:
    all_categories.insert(0, 'category_key',
                          range(1, 1 + len(all_categories)))

    # unified category_name: prefer the bgg name; for bga-only categories
    # (no bgg_name) fall back to the bga name:
    all_categories['category_name'] = all_categories['bgg_name']
    all_categories.loc[all_categories['category_name'].isna(),
                       'category_name'] = all_categories['bga_name']

    # export categories df:
    export_path = '../Data/Joined/Integration/GameInformation/05_Categories_Integrated_with_bga_and_bgg_ids.csv'
    export_df_to_csv(all_categories, export_path)
def match_online_game_names_and_bgg_names():
    """Attach BGG game ids to the online-games table via name matching.

    Rows that already carry a BGGID are left untouched. For the rest we
    first take names that appear verbatim in the BGG game list, then
    fuzzy-match the remainder with Jaccard similarity, write the ids back,
    and export only the rows that ended up with a bgg_id.
    """
    onlinegames_path = get_latest_version_of_file(
        '../Data/Onlinegames/Raw/Onlineboardgames_table_raw.csv')
    onlinegames_df = pd.read_csv(onlinegames_path, sep=';')

    bgg_path = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_path, index_col=0)
    bgg_names = bgg_df['name'].tolist()

    # only games still missing a BGG id need matching:
    games_missing_id = onlinegames_df[onlinegames_df['BGGID'].isna()]
    names_missing_id = games_missing_id['Name'].tolist()

    # step 1: exact name matches between the two datasets
    exact_matches = list(set(bgg_names).intersection(set(names_missing_id)))
    exact_match_records = [{
        'online_name': n,
        'bgg_name': n
    } for n in exact_matches]

    # remove the exact matches before the (expensive) fuzzy pass:
    remaining_bgg_df = bgg_df[~bgg_df['name'].isin(exact_matches)]
    remaining_games = games_missing_id[
        ~games_missing_id['Name'].isin(exact_matches)]
    remaining_online_names = remaining_games['Name'].tolist()
    remaining_bgg_names = remaining_bgg_df['name'].tolist()

    # step 2: fuzzy-match the leftovers (Jaccard similarity)
    fuzzy_records = []
    for online_name in remaining_online_names:
        best = find_match(online_name, remaining_bgg_names,
                          JACCARD_THRESHOLD_GAME_NAME)
        fuzzy_records.append({
            'online_name': online_name,
            'bgg_name': best['name'],
            'jaccard_score': best['jaccard_score']
        })

    # keep only rows with an actual match (empty score = no match),
    # then append the exact matches:
    fuzzy_records = [r for r in fuzzy_records if r['jaccard_score'] != '']
    matches_df = pd.DataFrame(fuzzy_records + exact_match_records)

    # resolve matched bgg names to bgg ids:
    with_ids = pd.merge(left=matches_df, right=bgg_df,
                        left_on='bgg_name', right_on='name')
    matches_df = with_ids[['bgg_name', 'online_name', 'bgg_game_id']]

    # write the ids back onto the online-games rows that were matched:
    matched_games = pd.merge(left=games_missing_id, right=matches_df,
                             left_on='Name', right_on='online_name')
    matched_games['BGGID'] = matched_games['bgg_game_id']
    # keep only the columns of the original online games table:
    matched_games = matched_games[[
        'Onlinegamelink ID', 'Name', 'Onlinegamelink', 'Origin', 'BGGID'
    ]]

    # rows NOT touched in this run: previously matched games plus the ones
    # no match could be found for
    untouched_games = onlinegames_df[~onlinegames_df['Onlinegamelink ID'].isin(
        matched_games['Onlinegamelink ID'].tolist())]

    # recombine into the full table with the newly found bgg ids:
    onlinegames_df = pd.concat([untouched_games, matched_games])
    onlinegames_df.drop_duplicates(subset=['Onlinegamelink ID'], inplace=True)

    ## export online games:
    onlinegames_df.rename(columns={
        'Name': 'name',
        'Onlinegamelink ID': 'online_game_id',
        'Onlinegamelink': 'url',
        'Origin': 'origin',
        'BGGID': 'bgg_id'
    }, inplace=True)
    onlinegames_df = onlinegames_df.drop(columns=['Unnamed: 0'])

    # NOTE: if bgg_id ever has to be int, NaNs must be handled first,
    # e.g. fillna(0.0).astype(int).

    # drop online games that still have no bgg_id:
    onlinegames_df = onlinegames_df[~onlinegames_df['bgg_id'].isna()]

    # export result to csv:
    export_path = '../Data/Onlinegames/Processed/online_games.csv'
    export_df_to_csv(onlinegames_df, export_path)
def match_game_names():
    """Match BGA and BGG boardgames by game name and publication year.

    How it works:
    - Game titles are compared via Jaccard similarity over n-grams (n=3),
      after removing low-information stopwords like 'board' and 'game'
      that otherwise distort scores (e.g. '7 Wonders' vs
      '7 Wonders - The Board Game').
    - For each BGA game the candidate with the highest jaccard score is
      chosen; it counts as a match only if the score exceeds the
      threshold.

    Scalability:
    - Brute force would need ~8,000 x ~19,000 = 152,000,000 comparisons.
      We reduce this in two stages:
      1) Exact matches on (name, year_published) are taken first and
         subtracted from both datasets (~70% complexity reduction).
      2) The remaining games are grouped by publication year; a BGA game
         is only compared against BGG games published in the same year
         +/- 1. This brings the total down to roughly 1,000,000
         comparisons (~98% fewer than brute force).
    """
    # Import bgg and bga data:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)

    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)

    ## find exact matches on (game name, year published):
    exact_matches_join_df = pd.merge(left=bgg_df,
                                     right=bga_df,
                                     on=['name', 'year_published'])

    # ids of exactly matched games:
    exact_matches_bgg_ids = exact_matches_join_df['bgg_game_id'].tolist()
    exact_matches_bga_ids = exact_matches_join_df['bga_game_id'].tolist()

    # subtract exact matches from the datasets to reduce their size
    # (.copy() so the renames/drops below act on real frames, not views):
    subset_bgg_df = bgg_df[
        ~bgg_df['bgg_game_id'].isin(exact_matches_bgg_ids)].copy()
    subset_bga_df = bga_df[
        ~bga_df['bga_game_id'].isin(exact_matches_bga_ids)].copy()
    subset_bgg_df.rename(columns={'year_published': 'year_published_bgg'},
                         inplace=True)
    subset_bga_df.rename(columns={'year_published': 'year_published_bga'},
                         inplace=True)

    ## Candidate generation: group the remaining games by publication year
    ## so each BGA game is only compared to BGG games of adjacent years.

    # Extract the distinct publication years present in the bga dataset
    # (int cast because the column holds floats with NaNs):
    all_years = subset_bga_df['year_published_bga'].dropna().tolist()
    years = sorted(set(map(int, all_years)), reverse=True)

    # Games without a publish year cannot take part in year-grouped
    # matching. BUG FIX: the two labels below were swapped (the bgg count
    # was reported as bga and vice versa).
    print('Dropped ' + str(subset_bga_df['year_published_bga'].isna().sum()) +
          ' rows from bga_dataset from name_matching')
    print('Dropped ' + str(subset_bgg_df['year_published_bgg'].isna().sum()) +
          ' rows from bgg_dataset from name_matching')
    # BUG FIX: a bare dropna() also removed games that merely had NaN in
    # unrelated columns (description, image_url, ...); per the intent
    # stated above, only drop rows with a missing publish year:
    subset_bgg_df.dropna(subset=['year_published_bgg'], inplace=True)
    subset_bga_df.dropna(subset=['year_published_bga'], inplace=True)

    # strip the '.0' off each year by converting to int: 2018.0 -> 2018
    subset_bga_df["year_published_bga"] = subset_bga_df[
        "year_published_bga"].astype(int)

    # dictionaries mapping publication year -> list of game records;
    # during matching we only compare names of games with (nearly) the
    # same publication year
    bgg_dic_grouped_by_year = {}
    bga_dic_grouped_by_year = {}
    for year in years:
        bgg_dic_grouped_by_year[year] = subset_bgg_df[
            subset_bgg_df['year_published_bgg'] == year].to_dict('records')
        bga_dic_grouped_by_year[year] = subset_bga_df[
            subset_bga_df['year_published_bga'] == year].to_dict('records')

    ## Iterate over all bga games without an exact bgg match. Candidates
    ## are all bgg games published in the same year or one year earlier /
    ## later; the best jaccard match above the threshold wins.
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            input_string = bga_game['name']
            candidate_list = [
                g['name'] for g in bgg_dic_grouped_by_year[year]
            ]
            if year + 1 in bgg_dic_grouped_by_year:
                candidate_list += [
                    g['name'] for g in bgg_dic_grouped_by_year[year + 1]
                ]
            if year - 1 in bgg_dic_grouped_by_year:
                candidate_list += [
                    g['name'] for g in bgg_dic_grouped_by_year[year - 1]
                ]

            # find_match returns the candidate with the highest jaccard
            # similarity; if the threshold is not exceeded it returns
            # empty strings.
            match = find_match(input_string, candidate_list,
                               JACCARD_THRESHOLD_GAME_NAME)
            bga_game['match'] = match['name']
            bga_game['jaccard_score'] = match['jaccard_score']

    # COMPARISONS is a module-level counter incremented by find_match:
    global COMPARISONS
    print('Number of comparisons: ' + str(COMPARISONS))

    # flatten the year-grouped records back into a dataframe:
    bga_list_matches = [
        bga_game for year in years
        for bga_game in bga_dic_grouped_by_year[year]
    ]
    jaccard_matches_df = pd.DataFrame(bga_list_matches)

    # just for debugging and inspecting results:
    analyse_df = pd.DataFrame(bga_list_matches)
    analyse_df = analyse_df[analyse_df['jaccard_score'] != '']
    analyse_df = analyse_df[['name', 'match', 'jaccard_score']]
    analyse_df = analyse_df.sort_values('jaccard_score', ascending=False)

    ## Build the final (bgg_game_id, bga_game_id) table in three steps:

    # 1) Exact matches — keep only the ID columns:
    exact_matches_join_df = exact_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 2) Jaccard matches — cut rows below the threshold (-> no match),
    # then join back to the bgg frame on (matched name, year) to recover
    # the bgg id:
    jaccard_matches_df = jaccard_matches_df[jaccard_matches_df['match'] != '']
    jaccard_matches_df = jaccard_matches_df[[
        'bga_game_id', 'name', 'year_published_bga', 'match', 'jaccard_score'
    ]]
    jaccard_matches_df.rename(columns={'name': 'bga_name'}, inplace=True)
    jaccard_matches_join_df = pd.merge(
        left=bgg_df[['bgg_game_id', 'name', 'year_published']],
        right=jaccard_matches_df,
        left_on=['name', 'year_published'],
        right_on=['match', 'year_published_bga'])
    jaccard_matches_join_df = jaccard_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 3) Concat both dfs and store the matches to csv:
    matched_game_ids_df = pd.concat(
        [exact_matches_join_df, jaccard_matches_join_df])
    export_df_to_csv(
        matched_game_ids_df,
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv'
    )