def normalize_bga_game_categories_relation():
    """Replace BGA game ids in the game/category relation with integrated game keys and export the result."""
    # Resolve and load the newest BGA game/category relation export.
    relation_path = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_*.json')
    relation_df = import_json_to_dataframe(relation_path)

    # Load the BGA category list (read here for parity with the rest of the pipeline).
    categories_bga = pd.read_csv(
        '../Data/BoardGameAtlas/Raw/API/categories/all_bga_categories.csv',
        index_col=0)

    # Load the integrated game-key translation table.
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # Swap the BGA-specific game id for the global game key.
    relation_df = pd.merge(left=relation_df,
                           right=game_keys,
                           left_on='game_id',
                           right_on='bga_game_id')

    # Normalize: keep only the two key columns.
    relation_df = relation_df[['game_key', 'category_id']]

    # Persist the cleaned relation.
    export_df_to_csv(
        relation_df,
        '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv')
def integrate_game_name_relation_tables():
    """Combine the BGA and BGG game-name relation tables into one GameNameTranslation table.

    Both sources are re-keyed to the integrated 'game_key', concatenated,
    de-duplicated and exported to CSV.

    Fix: the original called ``drop_duplicates(inplace=True)`` twice in a row;
    the redundant second call was removed (the first already leaves the frame
    duplicate-free).
    """
    # Import BGA game_name_relation table:
    fuzzy_import_path_1 = '../Data/BoardGameAtlas/Processed/API/06_BGA_Game_Names_Relation_*.json'
    import_path_1 = get_latest_version_of_file(fuzzy_import_path_1)
    names_bga = import_json_to_dataframe(import_path_1)

    # Import BGG game_name_relation table:
    fuzzy_import_path_2 = '../Data/BoardGameGeeks/Processed/GameInformation/06_BGG_Game_Name_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(fuzzy_import_path_2)
    names_bgg = pd.read_csv(import_path_2, index_col=0)

    # Import Game keys:
    import_path_3 = '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv'
    game_keys = pd.read_csv(import_path_3, index_col=0)

    # Replace bga 'game_id' with 'game_key'
    names_bga = pd.merge(left=names_bga,
                         right=game_keys,
                         left_on='game_id',
                         right_on='bga_game_id')
    names_bga = names_bga[['game_key', 'game_name']]

    # Replace bgg 'game_id' with 'game_key'
    names_bgg = pd.merge(left=names_bgg,
                         right=game_keys,
                         left_on='bgg_game_id',
                         right_on='bgg_game_id')
    names_bgg = names_bgg[['game_key', 'game_name']]

    # Merge both dataframes:
    names_combined = pd.concat([names_bga, names_bgg]).sort_values('game_key')

    # Report and remove duplicates (count computed before the drop):
    print(
        'Number of duplicate game names in GameNameTranslation table found and dropped: '
        + str(len(names_combined) - len(names_combined.drop_duplicates())))
    names_combined.drop_duplicates(inplace=True)

    # Export result:
    export_path = '../Data/Joined/Results/GameNameTranslation.csv'
    export_df_to_csv(names_combined, export_path)
def create_list_of_all_bga_designers():
    """Extract the unique BGA designers (id + url) from the designer/game relation and export them.

    Fix: the original called ``drop_duplicates(..., inplace=True)`` directly on a
    column slice of the relation frame, which raises pandas'
    SettingWithCopyWarning and is not guaranteed to modify the intended object.
    We now take an explicit copy and reassign the de-duplicated result.
    """
    # import bga designers
    fuzzy_import_path = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation*.json'
    import_path = get_latest_version_of_file(fuzzy_import_path)
    bga_designers_game_relation = import_json_to_dataframe(import_path)

    # extract designer ids and designer urls (explicit copy avoids chained-assignment issues)
    designers = bga_designers_game_relation[['designer_id', 'designer_url']].copy()

    # keep only the first occurrence of each designer id:
    designers = designers.drop_duplicates(subset='designer_id', keep='first')

    # export designers to csv:
    export_path = '../Data/BoardGameAtlas/Processed/API/BGA_All_Unique_designers.csv'
    export_df_to_csv(designers, export_path)
def normalize_bgg_game_categories_relation():
    """Build a BGG category lookup table and re-key the BGG game/category relation to game keys."""
    # Locate and load the newest BGG game/category relation export.
    relation_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/05_BGG_Game_category_Relation_*.csv'
    relation_path = get_latest_version_of_file(relation_fuzzy)
    relation_df = pd.read_csv(relation_path, index_col=0)

    # Distinct category names form the category lookup table.
    categories_bgg = pd.DataFrame(relation_df['category_name'].drop_duplicates())

    # Assign surrogate category keys, starting at 1001.
    categories_bgg.insert(0, 'bgg_category_key',
                          range(1001, 1001 + len(categories_bgg)))

    # Integrated game-key translation table.
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # Replace the BGG game id with the global game key.
    relation_df = pd.merge(left=relation_df,
                           right=game_keys,
                           left_on='bgg_game_id',
                           right_on='bgg_game_id')

    # Replace 'category_name' with the surrogate 'bgg_category_key'.
    relation_df = pd.merge(left=relation_df,
                           right=categories_bgg,
                           left_on='category_name',
                           right_on='category_name')

    # Normalize: keep only the two key columns.
    relation_df = relation_df[['game_key', 'bgg_category_key']]

    # Export the category lookup table.
    export_df_to_csv(categories_bgg,
                     '../Data/BoardGameGeeks/Raw/BGG_categories.csv')

    # Export the re-keyed relation.
    # NOTE(review): the file name says 'BGA' although this is BGG data — kept
    # byte-identical because downstream steps may depend on this exact path.
    export_df_to_csv(
        relation_df,
        '../Data/BoardGameGeeks/Processed/GameInformation/05_BGA_Game_Categories_Relation_Cleaned.csv')
def create_id_list_of_included_bga_games():
    """
    Extracts ids of bga games previously obtained from bga api and create a JSON file
    containing the IDs. This list will later be used for the BGA Review API requests.
    """
    # The exact file name carries a version suffix, so resolve the latest one first.
    latest_file = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    games_df = import_json_to_dataframe(latest_file)

    # Only the id column is needed.
    game_ids = games_df['bga_game_id']

    # Persist the id list as JSON.
    export_df_to_json(
        game_ids,
        '../Data/BoardGameAtlas/Processed/API/BGA_Ids_of_boardgames_included.json')
def merge_scraped_bga_designer_data_and_api_data():
    """Join scraped BGA designer profiles with the API designer/game relation on the designer url."""
    # Scraped designer profiles.
    scrapy_path = '../Data/BoardGameAtlas/Raw/Scrapy/designers/bga_designers.json'
    designers_scrapy = import_json_to_dataframe(scrapy_path)

    # API designer/game relation (latest version).
    relation_path = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation_*.json')
    designers_game_relation = import_json_to_dataframe(relation_path)

    # The image url column holds single-element lists; unwrap them.
    designers_scrapy = designers_scrapy.explode('designer_bga_image_url')

    # Join both sources on the designer url.
    designers_merged = pd.merge(left=designers_scrapy,
                                right=designers_game_relation,
                                left_on='designer_url',
                                right_on='designer_url')

    # Persist the merged data.
    export_df_to_csv(
        designers_merged,
        '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv')
def merge_bga_and_bgg_publisher_game_relation():
    """Re-key both publisher/game relations to (game_key, publisher_key) and export their union."""
    # BGA publisher/game relation.
    bga_relation = pd.read_csv(
        '../Data/BoardGameAtlas/Processed/API/bga_publishers_scrapy_and_api_data_merged.csv',
        index_col=0)
    bga_relation = bga_relation[['game_id', 'publisher_id']]

    # BGG publisher/game relation (latest version).
    bgg_relation_path = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/02_BGG_Game_Publisher_Relation_*.csv')
    bgg_relation = pd.read_csv(bgg_relation_path, index_col=0)
    bgg_relation = bgg_relation[['bgg_game_id', 'publisher_name']]

    # Integrated publisher table.
    publishers = pd.read_csv('../Data/Joined/Results/publisher.csv', index_col=0)

    # Integrated game-key translation table.
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # BGA side: game_id -> game_key.
    bga_relation = pd.merge(left=bga_relation,
                            right=game_keys,
                            left_on='game_id',
                            right_on='bga_game_id')
    bga_relation = bga_relation[['game_key', 'publisher_id']]

    # BGA side: publisher_id -> publisher_key (via publisher_bga_id).
    bga_relation = pd.merge(left=publishers,
                            right=bga_relation,
                            left_on='publisher_bga_id',
                            right_on='publisher_id')
    bga_relation = bga_relation[['game_key', 'publisher_key']]

    # BGG side: bgg_game_id -> game_key.
    bgg_relation = pd.merge(left=bgg_relation,
                            right=game_keys,
                            left_on='bgg_game_id',
                            right_on='bgg_game_id')
    bgg_relation = bgg_relation[['game_key', 'publisher_name']]

    # BGG side: publisher_name -> publisher_key (via bgg_publisher_name).
    bgg_relation = pd.merge(left=bgg_relation,
                            right=publishers,
                            left_on='publisher_name',
                            right_on='bgg_publisher_name')
    bgg_relation = bgg_relation[['game_key', 'publisher_key']]

    # Union of both relations, with duplicates removed.
    combined = pd.concat([bga_relation, bgg_relation])
    combined.drop_duplicates(inplace=True)

    # export game_publisher relation:
    export_df_to_csv(combined, '../Data/Joined/Results/Publisher_Game_Relation.csv')
def merge_bga_and_bgg_designers():
    """Integrate BGA and BGG designers into one Designer table.

    Designers from both sources are matched first by exact name, then by
    Jaccard similarity (via find_match / JACCARD_THRESHOLD_DESIGNERS).
    Matched and unmatched designers are concatenated, given a surrogate
    'designer_key' and exported to '../Data/Joined/Results/Designer.csv'.
    """
    # import bga data:
    import_path_1 = '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv'
    bga_designers_game_relation = pd.read_csv(import_path_1, index_col=0)
    # import bgg data:
    import_path_2_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/03_BGG_Game_designer_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    bgg_designers_game_relation = pd.read_csv(import_path_2, index_col=0)
    # drop NA's from bga_designers_game_relation names
    bga_designers_na = bga_designers_game_relation[
        bga_designers_game_relation['designer_name'].isnull()]
    if len(bga_designers_na) > 0:
        print(
            str(len(bga_designers_na)) +
            ' rows dropped from bga_game_relation table because designer_names are missing.'
        )
    bga_designers_game_relation = bga_designers_game_relation[
        ~bga_designers_game_relation['designer_name'].isnull()]
    # create designers df:
    bga_designers_df = bga_designers_game_relation[[
        'designer_name', 'designer_bga_image_url', 'designer_url', 'designer_id'
    ]].drop_duplicates()
    bga_designers_df.rename(columns={
        'designer_bga_image_url': 'designer_image_url',
        'designer_id': 'designer_bga_id'
    }, inplace=True)
    bgg_designers_df = pd.DataFrame(
        bgg_designers_game_relation[['designer_name']].drop_duplicates())
    bgg_designers_df.rename(columns={'designer_name': 'bgg_designer_name'},
                            inplace=True)
    # add bgg_designer_key (surrogate key, 1-based):
    bgg_designers_df.insert(0, 'designer_bgg_key',
                            range(1, 1 + len(bgg_designers_df)))
    # extract designer names:
    bga_designer_names_list = bga_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()
    bgg_designer_names_list = bgg_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()
    # match designer names:
    # get exact name matches:
    exact_matches = list(
        set(bga_designer_names_list).intersection(bgg_designer_names_list))
    # subsets for data that could not get matched exactly:
    bga_names_not_matched_list = [
        name for name in bga_designer_names_list if name not in exact_matches
    ]
    bgg_names_not_matched_list = [
        name for name in bgg_designer_names_list if name not in exact_matches
    ]
    # jaccard matching for names that could not be matched exactly.
    # find_match returns a dict with 'name' and 'jaccard_score'; an empty
    # 'jaccard_score' signals that the threshold was not reached.
    matches = []
    for bga_designer in bga_names_not_matched_list:
        match = find_match(bga_designer, bgg_names_not_matched_list,
                           JACCARD_THRESHOLD_DESIGNERS)
        matches.append({
            'bga_name': bga_designer,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })
    # create list of matched designer names:
    jaccard_matches_bga = [
        designer['bga_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]
    jaccard_matches_bgg = [
        designer['bgg_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]
    # create list of all names matched (exact + jaccard):
    all_matches_bga = exact_matches + jaccard_matches_bga
    all_matches_bgg = exact_matches + jaccard_matches_bgg
    # create dataframe of matched designers:
    jaccard_matches_df = pd.DataFrame(matches)
    jaccard_matches_df = jaccard_matches_df[
        jaccard_matches_df['jaccard_score'] != ''].sort_values('jaccard_score',
                                                               ascending=False)
    del jaccard_matches_df['jaccard_score']
    # 1) Create DF of all designers that could be matched
    #    a) exact matches
    #    b) jaccard matches
    # 2) Create DF of bga designers that could not be matched
    # 3) Create DF ob bgg designers that could not be matched
    # 4) Concat all DFs to one designers df
    # Structure: designer_key | designer_name | designer_bga_id | designer_bgg_id | designer_url | designer_image_url
    # 1) a)
    bga_exact_matches = bga_designers_df[
        bga_designers_df['designer_name'].isin(exact_matches)]
    bgg_exact_matches = bgg_designers_df[
        bgg_designers_df['bgg_designer_name'].isin(exact_matches)]
    joined_exact_matches = pd.merge(left=bga_exact_matches,
                                    right=bgg_exact_matches,
                                    left_on='designer_name',
                                    right_on='bgg_designer_name')
    # 1) b)
    bga_jaccard_matches = pd.merge(left=bga_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='designer_name',
                                   right_on='bga_name')
    bgg_jaccard_matches = pd.merge(left=bgg_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='bgg_designer_name',
                                   right_on='bgg_name')
    joined_jaccard_matches = pd.merge(left=bga_jaccard_matches,
                                      right=bgg_jaccard_matches,
                                      left_on='designer_name',
                                      right_on='bga_name')
    # drop columns not needed
    joined_jaccard_matches = joined_jaccard_matches[[
        'designer_name', 'designer_bga_id', 'designer_bgg_key',
        'bgg_designer_name', 'designer_url', 'designer_image_url'
    ]]
    # 2)
    bga_no_matches = bga_designers_df[~bga_designers_df['designer_name'].
                                      isin(all_matches_bga)]
    # 3)
    bgg_no_matches = bgg_designers_df[~bgg_designers_df['bgg_designer_name'].
                                      isin(all_matches_bgg)]
    # add designers column:
    # NOTE(review): assigning to a filtered slice may raise pandas'
    # SettingWithCopyWarning here — consider .copy() before this assignment.
    bgg_no_matches['designer_name'] = bgg_no_matches['bgg_designer_name']
    # 4) Create large dataframe by concatenating all dataframes:
    # size: 473 [1a] + 7 [1b] + 25 [2] + 5928 [3] = 6433
    designers_df = pd.concat([
        joined_exact_matches, joined_jaccard_matches, bga_no_matches,
        bgg_no_matches
    ])
    # add designer key (surrogate key, 1-based):
    designers_df.insert(0, 'designer_key', range(1, 1 + len(designers_df)))
    # remove duplicates:
    designers_df.drop_duplicates(inplace=True)
    # export designers
    export_path = '../Data/Joined/Results/Designer.csv'
    export_df_to_csv(designers_df, export_path)
def merge_online_games():
    """Merge the scraped Yucata, Boardgamearena and Tabletopia tables into one online-games table.

    Columns are harmonised to Name / Onlinegamelink / Origin / BGGID, a
    surrogate 'Onlinegamelink ID' is added and the result is written to
    '../Data/Onlinegames/Raw/Onlineboardgames_table_raw.csv'.

    Fixes: ``DataFrame.append`` and the positional ``drop(labels, 1)`` axis
    argument were removed in pandas 2.0 — replaced with ``pd.concat`` and
    ``drop(columns=...)``.
    """
    # import all online game csvs to dataframes
    yucata_scrape_filename = get_latest_version_of_file(
        '../Data/Onlinegames/Yucata/Raw/Yucata_all_data_raw.csv')
    yucata_scrape_df = pd.read_csv(yucata_scrape_filename, sep=';')
    boardgamearena_scrape_filename = get_latest_version_of_file(
        '../Data/Onlinegames/Boardgamearena/Raw/Boardgamearena_all_data_raw.csv'
    )
    boardgamearena_scrape_df = pd.read_csv(boardgamearena_scrape_filename,
                                           sep=';')
    tabletopia_scrape_filename = get_latest_version_of_file(
        '../Data/Tabletopia/Tabletopia/Raw/tabletopia_all_data_raw.csv')
    tabletopia_scrape_df = pd.read_csv(tabletopia_scrape_filename, sep=',')

    # drop potential duplicates
    yucata_scrape_df.drop_duplicates(inplace=True)
    boardgamearena_scrape_df.drop_duplicates(inplace=True)
    tabletopia_scrape_df.drop_duplicates(inplace=True)

    # rename columns to the harmonised schema
    yucata_scrape_df.rename(columns={
        'YucataName': 'Name',
        'YucataPlayers': 'yucata_players',
        'YucataTime': 'playtime',
        'YucataAuthor': 'yucata_author',
        'YucataLink': 'Onlinegamelink',
        'YucataBoardGameGeekIF': 'BGGID'
    }, inplace=True)
    boardgamearena_scrape_df.rename(columns={
        'game_name_boardgamearena': 'Name',
        'game_url_boardgamearena': 'Onlinegamelink'
    }, inplace=True)
    tabletopia_scrape_df.rename(columns={
        'game_name_tabletopia': 'Name',
        'game_url_tabletopia': 'Onlinegamelink',
        'bgg_link_tabletopia': 'bgg_url',
    }, inplace=True)

    # Add origin of data as text
    yucata_scrape_df['Origin'] = 'Yucata'
    boardgamearena_scrape_df['Origin'] = 'Boardgamearena'
    tabletopia_scrape_df['Origin'] = 'Tabletopia'

    # Extract BGG ID from the Tabletopia bgg url (text between 'e/' and the next '/')
    tabletopia_scrape_df['BGGID'] = tabletopia_scrape_df[
        'bgg_url'].str.extract('e/(.+?)/')

    # Delete unnecessary columns (keyword 'columns=' instead of the removed
    # positional axis argument)
    yucata_scrape_df = yucata_scrape_df.drop(columns=[
        'yucata_players', 'playtime', 'YucataAge', 'yucata_author',
        'YucataIllustrator', 'YucataPublisher', 'YucataDeveloper',
        'YucataOnlinesince'
    ])
    boardgamearena_scrape_df = boardgamearena_scrape_df.drop(columns=[
        'number_players_boardgamearena', 'playing_time_boardgamearena',
        'game_strategy_boardgamearena', 'game_interaction_boardgamearena',
        'game_complexity_boardgamearena', 'game_luck_boardgamearena',
        'rounds_played_boardgamearena', 'available_since_boardgamearena',
        'version_boardgamearena', 'description_boardgamearena',
        'author_name_boardgamearena', 'graphicer_name_boardgamearena',
        'publisher_name_boardgamearena', 'basegame_release_year_boardgamearena',
        'developer_name_boardgamearena'
    ])
    tabletopia_scrape_df = tabletopia_scrape_df.drop(columns=[
        'player_min_age_tabletopia', 'number_players_tabletopia',
        'playing_time_tabletopia', 'rating_tabletopia', 'designer_tabletopia',
        'illustrator_tabletopia', 'publisher_tabletopia', 'author_tabletopia',
        'genre_tabletopia', 'description_tabletopia', 'bgg_url'
    ])

    # merge dataframes (the empty frame pins the harmonised column order;
    # pd.concat replaces the removed DataFrame.append)
    onlinegames_merge_df = pd.DataFrame(
        columns=['Name', 'Onlinegamelink', 'Origin', 'BGGID'])
    onlinegames_merge_df = pd.concat([
        onlinegames_merge_df, yucata_scrape_df, boardgamearena_scrape_df,
        tabletopia_scrape_df
    ])

    # create primary keys (surrogate, 1-based)
    onlinegames_merge_df.insert(0, 'Onlinegamelink ID',
                                range(1, 1 + len(onlinegames_merge_df)))

    # export dataframe
    export_path = '../Data/Onlinegames/Raw/Onlineboardgames_table_raw.csv'
    onlinegames_merge_df.to_csv(export_path, na_rep='NULL', sep=';')
def match_online_game_names_and_bgg_names():
    """Fill in missing BGG ids in the online-games table by matching game names.

    Online games without a BGGID are matched against the BGG game list first
    exactly, then by Jaccard similarity (find_match / JACCARD_THRESHOLD_GAME_NAME).
    Matched rows get their BGGID patched in; rows still without a bgg_id are
    dropped before exporting '../Data/Onlinegames/Processed/online_games.csv'.
    """
    onlinegames_filename = get_latest_version_of_file(
        '../Data/Onlinegames/Raw/Onlineboardgames_table_raw.csv')
    onlinegames_df = pd.read_csv(onlinegames_filename, sep=';')
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    bgg_names = bgg_df['name'].tolist()
    # Extract only games without BGG ID to match
    onlinegames_games_without_BGGID = onlinegames_df[
        onlinegames_df['BGGID'].isna()]
    onlinegame_names_without_BGGID = onlinegames_games_without_BGGID[
        'Name'].tolist()
    # Find exact matches Onlingames - BGG
    exact_matches = list(
        set(bgg_names).intersection(set(onlinegame_names_without_BGGID)))
    # Exact matches as list of dicts (can later be used to create a pd.DF)
    exact_matches_list_of_dict = [{
        'online_name': x,
        'bgg_name': x
    } for x in exact_matches]
    # subtract exact matches from datasets:
    subset_bgg_df = bgg_df[~bgg_df['name'].isin(exact_matches)]
    subset_onlinegames_df = onlinegames_games_without_BGGID[
        ~onlinegames_games_without_BGGID['Name'].isin(exact_matches)]
    subset_onlinegame_names_without_BGGID = subset_onlinegames_df[
        'Name'].tolist()
    subset_bgg_df_names = subset_bgg_df['name'].tolist()
    # Match left over names Onlinegames - BGG.
    # find_match returns {'name': ..., 'jaccard_score': ...}; an empty
    # 'jaccard_score' means the threshold was not reached.
    match_list = []
    for name in subset_onlinegame_names_without_BGGID:
        match = find_match(name, subset_bgg_df_names,
                           JACCARD_THRESHOLD_GAME_NAME)
        match_list.append({
            'online_name': name,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })
    # drop entries that could not be matched:
    match_list = [x for x in match_list if x['jaccard_score'] != '']
    # add exact matches to match_list:
    match_list = match_list + exact_matches_list_of_dict
    matches_df = pd.DataFrame(match_list)
    # merge matches and bgg to get bgg ids:
    merge_1 = pd.merge(left=matches_df,
                       right=bgg_df,
                       left_on='bgg_name',
                       right_on='name')
    matches_df = merge_1[['bgg_name', 'online_name', 'bgg_game_id']]
    # merge matches and online games df:
    merge_2 = pd.merge(left=onlinegames_games_without_BGGID,
                       right=matches_df,
                       left_on='Name',
                       right_on='online_name')
    merge_2['BGGID'] = merge_2['bgg_game_id']
    # keep only columns from original online games df:
    merge_2 = merge_2[[
        'Onlinegamelink ID', 'Name', 'Onlinegamelink', 'Origin', 'BGGID'
    ]]
    # create a temp_df that contains all games out of the online games df that were not matched in the process
    # (the ones that had been matched previously and the ones that could not be matched in the process)
    temp_df = onlinegames_df[~onlinegames_df['Onlinegamelink ID'].
                             isin(merge_2['Onlinegamelink ID'].tolist())]
    # combine both to get the full dataset with the additional information about the bgg_game_ids out of the games that
    # were successfully matched:
    onlinegames_df = pd.concat([temp_df, merge_2])
    onlinegames_df.drop_duplicates(subset=['Onlinegamelink ID'], inplace=True)
    ## export online games:
    # rename a few columns
    onlinegames_df.rename(columns={
        'Name': 'name',
        'Onlinegamelink ID': 'online_game_id',
        'Onlinegamelink': 'url',
        'Origin': 'origin',
        'BGGID': 'bgg_id'
    }, inplace=True)
    # drop the stray index column produced by the earlier to_csv round-trip
    onlinegames_df = onlinegames_df.drop(columns={'Unnamed: 0'})
    # If bgg_id has to be int (Beware of nAn conversion!)
    #onlinegames_df['bgg_id'] = onlinegames_df['bgg_id'].fillna(0.0).astype(int)
    #onlinegames_df['bgg_id'] = onlinegames_df['bgg_id'].astype(int)
    # drop online games without bgg_id:
    onlinegames_df = onlinegames_df[~onlinegames_df['bgg_id'].isna()]
    # export result to csv:
    export_path = '../Data/Onlinegames/Processed/online_games.csv'
    export_df_to_csv(onlinegames_df, export_path)
def match_game_names():
    """
    This function matches bga and bgg boardgames based on their game names and the year in which they were
    published.

    This is how it works:
    - We calculate n-grams with n=3 for each boardgamename.
    - By removing stopwords that appear in many games that don't add much meaning to the game title we can reduce
      the number of false-positives and false-negatives.
      Examples: the stopwords 'board' and 'game' are removed:
        bga_name = '7 Wonders'
        bgg_name = '7 Wonders - The Board Game'
        -> this would result in a rather low jaccard score without removing the stopwords.
        bga_name = 'Settlers - The Board Game'
        bgg_name = '7 Wonder - The Board Game'
        -> this would result in a rather high jaccard score considering that both do not refer to the same game.
    - We then compare the similarity of a bga candidate and a bgg candidate by calculating the jaccard similarity.
    - The candidate with the highest jaccard score is chosen. Only if the jaccard score of that candidate exceeds
      our threshold the games are matched.

    Scalability Challenge:
    - However, there is one issue with that strategy: Computing the jaccard similarity requires comparisons of
      ca. 8,000 bga games and ca. 19,000 bgg games [ O(n) = n^2 ]. Comparing all bga_games and all bgg_games would
      lead to an extremely long run time, which we want to avoid.
      -> 8,000 x 19,000 = 152,000,000 comparisons.

    Therefore we adjusted our approach:
    1) First, we find games that can be matched exactly. By this we mean games that have exactly the same name in
       both datasets. Since there are some games with duplicate game names that do not refer to the same game, we
       also include the year of publication. Therefore only games with exactly the same name and exactly the same
       year of publication are matched in this step.
       We can then subtract these games from the their datasets to decrease the sizes of games that have to be
       compared to: ca. 3,000 bga games and ca. 15,000 bgg games.
       -> 3,000 x 15,000 = 45,000,000 (complexity reduced by ~70%)
    2) This is still quite a lot of comparisons. However, we made another observation. We also tried matching games
       by only their game_name (not also taking the year_published into consideration). In the set of games that
       could be matched exactly, in almost all cases the publish years are the same, which makes sense obviously.
    3) Therefore we can further reduce complexity by grouping by publish years and comparing only games that have
       the same publish year. To make sure we don't lose games because the publish years deviate by one year, we
       also compare to games published in the years one year before and after.
       This further reduces the number of comparisons to: ~ 1,000,000

    Hence, by applying the similarity function only to the most promising pairs we reduced the number of required
    comparisons by 98%.
    """
    # Import bgg and bga data:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    bgg_names = bgg_df['name'].tolist()
    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)
    bga_names = bga_df['name'].tolist()
    # Create lists with bga and bgg ids:
    bgg_ids = bgg_df['bgg_game_id'].tolist()
    bga_ids = bga_df['bga_game_id'].tolist()
    # Check duplicate names:
    bgg_duplicate_names = set([x for x in bgg_names if bgg_names.count(x) > 1])
    bga_duplicate_names = set([x for x in bga_names if bga_names.count(x) > 1])
    ## find exact matches (game_name, year_published):
    exact_matches_join_df = pd.merge(left=bgg_df,
                                     right=bga_df,
                                     left_on=['name', 'year_published'],
                                     right_on=['name', 'year_published'])
    # create list of ids of exactly matched games:
    exact_matches_bgg_ids = exact_matches_join_df['bgg_game_id'].tolist()
    exact_matches_bga_ids = exact_matches_join_df['bga_game_id'].tolist()
    # subtract exact matches from datasets to reduce their size:
    subset_bgg_df = bgg_df[~bgg_df['bgg_game_id'].isin(exact_matches_bgg_ids)]
    subset_bga_df = bga_df[~bga_df['bga_game_id'].isin(exact_matches_bga_ids)]
    subset_bgg_df.rename(columns={'year_published': 'year_published_bgg'},
                         inplace=True)
    subset_bga_df.rename(columns={'year_published': 'year_published_bga'},
                         inplace=True)
    ## In the next part we now want to apply name matching. Our first task is to find candidates so that we don't
    ## have to compare all games from one dataset with all games from the other dataset. We do so by grouping by
    ## their year of publication.
    ## First, we need some preprocessing steps so that we can actually set up our candidates:
    # Extract years from bga dataset:
    # A lot of type casting due to unexpected errors with float and set
    all_years = subset_bga_df['year_published_bga'].dropna().tolist()
    all_years = list(map(int, all_years))
    years = list(set(all_years))
    years.sort(reverse=True)
    # Do not apply name matching to games where to publish_year is missing:
    # NOTE(review): the two print messages appear swapped (the first counts
    # bgg rows but says 'bga_dataset', and vice versa) — kept byte-identical.
    print('Dropped ' + str(subset_bgg_df['year_published_bgg'].isna().sum()) +
          ' rows from bga_dataset from name_matching')
    print('Dropped ' + str(subset_bga_df['year_published_bga'].isna().sum()) +
          ' rows from bgg_dataset from name_matching')
    subset_bgg_df.dropna(inplace=True)
    subset_bga_df.dropna(inplace=True)
    # strip of '.0' at the end of each year by converting to int: 2018.0 -> 2018
    subset_bga_df["year_published_bga"] = subset_bga_df[
        "year_published_bga"].astype(int)
    # create a dictionary to group all bgg games by their year of publication
    # during the name matching process we will only compare the names of games with the same publication year
    bgg_dic_grouped_by_year = {}
    bga_dic_grouped_by_year = {}
    # fill the previously created dictionaries that include all the games that were published in a certain year
    for year in years:
        bgg_dic_grouped_by_year[year] = subset_bgg_df[
            subset_bgg_df['year_published_bgg'] == year].to_dict('records')
        bga_dic_grouped_by_year[year] = subset_bga_df[
            subset_bga_df['year_published_bga'] == year].to_dict('records')
    ## Now we get to the interesting part:
    ## We iterate over all bga_games which we found no exact bgg_matches for. We then create a list with potential
    ## candidates including all bgg_games that were published in the same year or one year before or after.
    ## For these candidates we then apply name_matching using the jaccard similarity.
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            input_string = bga_game['name']
            candidate_list = []
            # create candidate_list with all bgg games that were published in the same year as the bga_game:
            for bgg_game in bgg_dic_grouped_by_year[year]:
                candidate_list.append(bgg_game['name'])
            # also check bgg games that were published in the previous year and one year later:
            if year + 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year + 1]:
                    candidate_list.append(bgg_game['name'])
            if year - 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year - 1]:
                    candidate_list.append(bgg_game['name'])
            # Try to match the input_string (target BGA Game name) one of the games in the candidate_list (bgg games).
            # The match with the highest jaccard similarity is returned. If there is no match, or the Jaccard threshold
            # can not be exceeded then an empty string is returned.
            match = find_match(input_string, candidate_list,
                               JACCARD_THRESHOLD_GAME_NAME)
            bga_game['match'] = match['name']
            bga_game['jaccard_score'] = match['jaccard_score']
    # COMPARISONS is a module-level counter (presumably incremented inside
    # find_match) — declared global here only to read it for reporting.
    global COMPARISONS
    print('Number of comparisons: ' + str(COMPARISONS))
    bga_list_matches = []
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            bga_list_matches.append(bga_game)
    # turn list of dictionaries back to data frame:
    jaccard_matches_df = pd.DataFrame(bga_list_matches)
    # just for debugging and inspecting results:
    analyse_df = pd.DataFrame(bga_list_matches)
    analyse_df = analyse_df[analyse_df['jaccard_score'] != '']
    analyse_df = analyse_df[['name', 'match', 'jaccard_score']]
    analyse_df = analyse_df.sort_values('jaccard_score', ascending=False)
    ## We have now succesfully found a large number of games that could be matched. All that's left to do is
    # creating a dataframe that contains the matched BGA and BGG IDs. We do so in three steps:
    # 1) Prepare DF containing BGA and BGG IDs of games that could be matched exactly by name and year_published
    # 2) Prepare DF containing BGA and BGG IDs of games that could be matched by string matching (jaccard method)
    # 3) Concatenate both data frames
    # 1) Exact matches
    # Keep only ID columns:
    exact_matches_join_df = exact_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]
    # 2) Jaccard matches
    # Cut of rows where the jaccard threshold wasn't reached (-> no match)
    jaccard_matches_df = jaccard_matches_df[jaccard_matches_df['match'] != '']
    jaccard_matches_df = jaccard_matches_df[[
        'bga_game_id', 'name', 'year_published_bga', 'match', 'jaccard_score'
    ]]
    jaccard_matches_df.rename(columns={'name': 'bga_name'}, inplace=True)
    # Join both datasets
    jaccard_matches_join_df = pd.merge(
        left=bgg_df[['bgg_game_id', 'name', 'year_published']],
        right=jaccard_matches_df,
        left_on=['name', 'year_published'],
        right_on=['match', 'year_published_bga'])
    jaccard_matches_join_df = jaccard_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]
    # 3) Concat both dfs
    matched_game_ids_df = pd.concat(
        [exact_matches_join_df, jaccard_matches_join_df])
    # 4) Store matches to csv:
    export_df_to_csv(
        matched_game_ids_df,
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv'
    )
def merge_game_information():
    """
    Merge the previously matched BGA and BGG game-information tables into a
    single BoardGames table plus a game-key lookup table.

    For the matched games there are four types of columns:
    a) columns that exist in both datasets but we only need to keep one of
       them (can include conflicting values): e.g. name, year_published,
       min_players, ... In this case we chose to keep the bgg columns!
       ["trust-your-friends" avoidance strategy: in case of contradicting
       values we keep values based on which data source they come from]
    b) columns that exist in both datasets but we want to keep both:
       e.g. bga_game_id/bgg_game_id, num_user_ratings, average_user_rating,
       bga_rank/bgg_rank, ...
    c) columns that exist only in the bgg dataset:
       e.g. num_user_comments, bgg_average_weight, ...
    d) columns that exist only in the bga dataset:
       e.g. reddit_all_time_count, bga_game_url, ...

    Side effects: reads the processed BGG/BGA game-information files and the
    matched-id csv; writes ../Data/Joined/Results/BoardGames.csv and
    ../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv.
    """
    # --- import data -------------------------------------------------------
    # bgg game information dataset:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)

    # bga game information dataset:
    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)

    # --- 1) split into three groups ---------------------------------------
    # a) matched games, b) unmatched BGG games, c) unmatched BGA games
    ids_matched_games_df = pd.read_csv(
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv',
        index_col=0)

    # .copy() so the later drop(..., inplace=True)/column assignments operate
    # on independent frames, not views of bgg_df/bga_df (SettingWithCopyWarning).
    bgg_subset_matches = bgg_df[bgg_df['bgg_game_id'].isin(
        ids_matched_games_df['bgg_game_id'])].copy()
    bga_subset_matches = bga_df[bga_df['bga_game_id'].isin(
        ids_matched_games_df['bga_game_id'])].copy()

    # 1b) BGG games not matched:
    bgg_subset_no_matches = bgg_df[~bgg_df['bgg_game_id'].isin(
        ids_matched_games_df['bgg_game_id'])]
    # 1c) BGA games not matched:
    bga_subset_no_matches = bga_df[~bga_df['bga_game_id'].isin(
        ids_matched_games_df['bga_game_id'])]

    # --- 2a) drop group-(a) duplicate columns from the bga side -----------
    # (bgg values win for these shared columns)
    drop_bga_columns = [
        'name', 'year_published', 'min_players', 'max_players',
        'min_playtime', 'max_playtime', 'min_age', 'game_description',
        'image_url', 'thumbnail_url'
    ]
    bga_subset_matches.drop(columns=drop_bga_columns, inplace=True)

    # add 'matched_bgg_id' column via the id-translation table:
    bga_subset_matches = pd.merge(left=bga_subset_matches,
                                  right=ids_matched_games_df,
                                  left_on='bga_game_id',
                                  right_on='bga_game_id')

    # merge both matched subsets into one frame:
    matched_games_df = pd.merge(left=bgg_subset_matches,
                                right=bga_subset_matches,
                                left_on=['bgg_game_id'],
                                right_on=['bgg_game_id'])

    # Handle duplicate ids in matched_games_df (a game matched twice):
    matched_games_df.drop_duplicates(subset=['bgg_game_id'],
                                     keep='first',
                                     inplace=True)
    matched_games_df.drop_duplicates(subset=['bga_game_id'],
                                     keep='first',
                                     inplace=True)

    # --- union step --------------------------------------------------------
    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0; semantics here are identical.
    games_df = pd.concat(
        [matched_games_df, bgg_subset_no_matches, bga_subset_no_matches],
        ignore_index=True,
        sort=False)

    # reorder columns (remaining columns keep their relative order):
    cols_to_order = [
        'name', 'bgg_game_id', 'bga_game_id', 'year_published',
        'min_players', 'max_players', 'min_playtime', 'max_playtime',
        'min_age', 'bgg_average_user_rating', 'bga_average_user_rating',
        'bgg_num_user_ratings', 'bga_num_user_ratings'
    ]
    new_columns = cols_to_order + (
        games_df.columns.drop(cols_to_order).tolist())
    games_df = games_df[new_columns]

    # create new unique surrogate key column:
    games_df.insert(0, 'game_key', range(100001, 100001 + len(games_df)))

    # key csv containing bga_game_id, bgg_game_id and game_key
    # (.copy() so drop_duplicates below does not warn about a view):
    key_df = games_df[['game_key', 'bga_game_id', 'bgg_game_id']].copy()

    # check if there are any full-row duplicates in games_df:
    games_df_duplicates = len(games_df) - len(games_df.drop_duplicates())
    if games_df_duplicates > 0:
        print('Warning. ' + str(games_df_duplicates) +
              ' duplicates found in BoardGameTable: ')
        games_df.drop_duplicates(inplace=True)
        print('Duplicates removed!')

    # check if there are any duplicate ids in key_df (ignoring NaN ids of
    # one-sided games):
    count_duplicates_bgg = len(key_df[~key_df['bgg_game_id'].isnull()]) - len(
        key_df[~key_df['bgg_game_id'].isnull()].drop_duplicates(
            subset='bgg_game_id'))
    count_duplicates_bga = len(key_df[~key_df['bga_game_id'].isnull()]) - len(
        key_df[~key_df['bga_game_id'].isnull()].drop_duplicates(
            subset='bga_game_id'))
    # BUGFIX: original summed count_duplicates_bga with itself, so bgg-side
    # duplicates alone never triggered the warning/cleanup branch.
    if (count_duplicates_bgg + count_duplicates_bga) > 0:
        print('Warning. Duplicates found: ')
        print('BGG_game_ids: ' + str(count_duplicates_bgg))
        print('BGA_game_ids: ' + str(count_duplicates_bga))
        key_df.drop_duplicates(inplace=True)
        print('Duplicates removed:')

    # --- fix badly encoded symbols in game_description ---------------------
    # Ordered (pattern, replacement) pairs, applied first to last. Patterns
    # are regexes: regex=True makes the pre-pandas-2.0 implicit default
    # explicit (in pandas >= 2.0 the default flipped to regex=False, which
    # would silently break the '&#...;' / '.;' cleanup rules below).
    # NOTE(review): some pairs look like identity mappings here; presumably
    # they repair mojibake sequences whose source bytes were normalized when
    # this file was re-encoded — confirm against the original encoding.
    # NOTE(review): the bare '.....;' .. '.;' rules delete ANY chars before a
    # semicolon, which is aggressive — kept for behavioral parity.
    description_fixes = [
        # quotation marks:
        (r'"', '\''),
        (r'”', '\''),
        (r'’', '\''),
        (r'“', '\''),
        (r'&', '&'),
        (r'é', 'e'),
        # Umlaute:
        (r'ä', 'ä'),
        (r'Ü', 'ü'),
        (r'ü', 'ü'),
        (r'ö', 'ö'),
        (r'ß', 'ß'),
        # dashes & non-breaking space:
        (r'–', '-'),
        (r'—', '-'),
        (r' ', ' '),
        (r'×', 'x'),
        (r'­', '-'),
        # html entity residues:
        (r'&#...;', ''),
        (r'&#..;', ' '),
        (r'&#.;', ''),
        (r'.....;', ''),
        (r'....;', ''),
        (r'...;', ''),
        (r'..;', ''),
        (r'.;', ''),
        # double semicolons and runs of spaces:
        (r';;', ' '),
        (' +', ' '),
    ]
    descriptions = games_df['game_description']
    for pattern, replacement in description_fixes:
        descriptions = descriptions.str.replace(pattern,
                                                replacement,
                                                regex=True)
    games_df['game_description'] = descriptions.str.strip()

    # --- export ------------------------------------------------------------
    export_df_to_csv(games_df, '../Data/Joined/Results/BoardGames.csv')
    export_df_to_csv(
        key_df,
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv')