def normalize_bga_game_categories_relation():
    # import bga_games_category_relation:
    categories_bga_games_relation_path_fuzzy = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_*.json'
    categories_bga_games_relation_path = get_latest_version_of_file(
        categories_bga_games_relation_path_fuzzy)
    categories_bga_games_relation_df = import_json_to_dataframe(
        categories_bga_games_relation_path)

    # import bga categories (not used further in this function; kept for reference):
    categories_bga = pd.read_csv(
        '../Data/BoardGameAtlas/Raw/API/categories/all_bga_categories.csv',
        index_col=0)

    # import game keys:
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # join games_category_relation table to replace bga_game_id column with game_keys column:
    categories_bga_games_relation_df = pd.merge(
        left=categories_bga_games_relation_df,
        right=game_keys,
        left_on='game_id',
        right_on='bga_game_id')

    # normalize by only keeping game_key and category_id
    categories_bga_games_relation_df = categories_bga_games_relation_df[[
        'game_key', 'category_id'
    ]]

    # export df
    export_path = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv'
    export_df_to_csv(categories_bga_games_relation_df, export_path)
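

# The helper functions used throughout these examples (get_latest_version_of_file,
# import_json_to_dataframe, export_df_to_csv, export_df_to_json) are not part of
# this listing. A minimal sketch of what they might look like, inferred from
# their call sites -- the real project may implement them differently:
import glob

import pandas as pd


def get_latest_version_of_file(fuzzy_path):
    # resolve a wildcard path such as '..._*.json' and return the newest match;
    # assumes the wildcard part (e.g. a timestamp) sorts chronologically
    matches = glob.glob(fuzzy_path)
    if not matches:
        raise FileNotFoundError('no file matches ' + fuzzy_path)
    return max(matches)


def import_json_to_dataframe(path):
    # load a json file of records into a pandas DataFrame
    return pd.read_json(path)


def export_df_to_csv(df, path):
    # write a DataFrame to csv, keeping the index
    df.to_csv(path)


def export_df_to_json(df, path):
    # write a DataFrame (or Series) to a json file
    df.to_json(path)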


def integrate_game_name_relation_tables():
    # Import BGA game_name_relation table:
    fuzzy_import_path_1 = '../Data/BoardGameAtlas/Processed/API/06_BGA_Game_Names_Relation_*.json'
    import_path_1 = get_latest_version_of_file(fuzzy_import_path_1)
    names_bga = import_json_to_dataframe(import_path_1)

    # Import BGG game_name_relation table:
    fuzzy_import_path_2 = '../Data/BoardGameGeeks/Processed/GameInformation/06_BGG_Game_Name_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(fuzzy_import_path_2)
    names_bgg = pd.read_csv(import_path_2, index_col=0)

    # Import Game keys:
    import_path_3 = '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv'
    game_keys = pd.read_csv(import_path_3, index_col=0)

    # Replace bga 'game_id' with 'game_key'
    names_bga = pd.merge(left=names_bga,
                         right=game_keys,
                         left_on='game_id',
                         right_on='bga_game_id')
    names_bga = names_bga[['game_key', 'game_name']]

    # Replace bgg 'game_id' with 'game_key'
    names_bgg = pd.merge(left=names_bgg,
                         right=game_keys,
                         on='bgg_game_id')
    names_bgg = names_bgg[['game_key', 'game_name']]

    # Merge both dataframes:
    names_combined = pd.concat([names_bga, names_bgg]).sort_values('game_key')

    # Remove duplicates:
    num_duplicates = len(names_combined) - len(names_combined.drop_duplicates())
    print('Number of duplicate game names in GameNameTranslation table '
          'found and dropped: ' + str(num_duplicates))
    names_combined.drop_duplicates(inplace=True)

    # Export result:
    export_path = '../Data/Joined/Results/GameNameTranslation.csv'
    export_df_to_csv(names_combined, export_path)


def create_list_of_all_bga_designers():
    # import bga designers
    fuzzy_import_path = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation*.json'
    import_path = get_latest_version_of_file(fuzzy_import_path)
    bga_designers_game_relation = import_json_to_dataframe(import_path)

    # extract designer ids and designer urls
    designers = bga_designers_game_relation[['designer_id', 'designer_url']]

    # keep only unique designers (reassign instead of inplace on a slice to
    # avoid pandas' SettingWithCopyWarning):
    designers = designers.drop_duplicates(subset='designer_id', keep='first')

    # export designers to csv:
    export_path = '../Data/BoardGameAtlas/Processed/API/BGA_All_Unique_designers.csv'
    export_df_to_csv(designers, export_path)


def normalize_bgg_game_categories_relation():
    # import bgg game-category relation:
    categories_bgg_games_relation_path_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/05_BGG_Game_category_Relation_*.csv'
    categories_bgg_games_relation_path = get_latest_version_of_file(
        categories_bgg_games_relation_path_fuzzy)
    categories_bgg_games_relation_df = pd.read_csv(
        categories_bgg_games_relation_path, index_col=0)

    # create categories list:
    categories_bgg = pd.DataFrame(
        categories_bgg_games_relation_df['category_name'].drop_duplicates())

    # create temporary key_column:
    categories_bgg.insert(0, 'bgg_category_key',
                          range(1001, 1001 + len(categories_bgg)))

    # import game keys:
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # join games_category_relation table to replace bgg_game_id column with game_keys column:
    categories_bgg_games_relation_df = pd.merge(
        left=categories_bgg_games_relation_df,
        right=game_keys,
        on='bgg_game_id')

    # replace 'category_name' with 'bgg_category_key' in categories_bgg_games_relation:
    categories_bgg_games_relation_df = pd.merge(
        left=categories_bgg_games_relation_df,
        right=categories_bgg,
        on='category_name')

    # normalize by only keeping game_key and bgg_category_key
    categories_bgg_games_relation_df = categories_bgg_games_relation_df[[
        'game_key', 'bgg_category_key'
    ]]

    # export bgg categories:
    export_path = '../Data/BoardGameGeeks/Raw/BGG_categories.csv'
    export_df_to_csv(categories_bgg, export_path)

    # export bgg game_categories_relation:
    export_path = '../Data/BoardGameGeeks/Processed/GameInformation/05_BGG_Game_Categories_Relation_Cleaned.csv'
    export_df_to_csv(categories_bgg_games_relation_df, export_path)


def create_id_list_of_included_bga_games():
    """
    Extracts the ids of the bga games previously obtained from the bga api and creates a JSON file containing them.
    This list is later used for the BGA review API requests.
    """

    # import file that contains information on bga_games
    # since the exact filename is unknown, we have to find file and its latest version first:
    filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')

    data = import_json_to_dataframe(filename)

    # extract ids
    ids = data['bga_game_id']

    # export data to json file
    filename_export = '../Data/BoardGameAtlas/Processed/API/BGA_Ids_of_boardgames_included.json'
    export_df_to_json(ids, filename_export)


def merge_scraped_bga_designer_data_and_api_data():
    # import scraped bga designer data
    import_path_1 = '../Data/BoardGameAtlas/Raw/Scrapy/designers/bga_designers.json'
    designers_scrapy = import_json_to_dataframe(import_path_1)

    # import api bga designer data
    import_path_2_fuzzy = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation_*.json'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    designers_game_relation = import_json_to_dataframe(import_path_2)

    # unwrap the list around the designer image url:
    designers_scrapy = designers_scrapy.explode('designer_bga_image_url')

    # merge both dataframes:
    designers_merged = pd.merge(left=designers_scrapy,
                                right=designers_game_relation,
                                on='designer_url')

    # export df
    export_path = '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv'
    export_df_to_csv(designers_merged, export_path)


def merge_bga_and_bgg_publisher_game_relation():
    # import bga_publisher_game_relation
    import_path_1 = '../Data/BoardGameAtlas/Processed/API/bga_publishers_scrapy_and_api_data_merged.csv'
    game_publisher_relation_bga = pd.read_csv(import_path_1, index_col=0)
    game_publisher_relation_bga = game_publisher_relation_bga[[
        'game_id', 'publisher_id'
    ]]

    # import bgg_publisher_game_relation
    import_path_2_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/02_BGG_Game_Publisher_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    game_publisher_relation_bgg = pd.read_csv(import_path_2, index_col=0)
    game_publisher_relation_bgg = game_publisher_relation_bgg[[
        'bgg_game_id', 'publisher_name'
    ]]

    # import publishers
    import_path_3 = '../Data/Joined/Results/publisher.csv'
    publishers = pd.read_csv(import_path_3, index_col=0)

    # import game keys
    import_path_4 = '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv'
    game_keys = pd.read_csv(import_path_4, index_col=0)

    # replace bga game ids with game keys
    game_publisher_relation_bga = pd.merge(left=game_publisher_relation_bga,
                                           right=game_keys,
                                           left_on='game_id',
                                           right_on='bga_game_id')
    game_publisher_relation_bga = game_publisher_relation_bga[[
        'game_key', 'publisher_id'
    ]]

    # replace publisher_bga_id with publisher_key
    game_publisher_relation_bga = pd.merge(left=publishers,
                                           right=game_publisher_relation_bga,
                                           left_on='publisher_bga_id',
                                           right_on='publisher_id')
    game_publisher_relation_bga = game_publisher_relation_bga[[
        'game_key', 'publisher_key'
    ]]

    # replace bgg game ids with game keys
    game_publisher_relation_bgg = pd.merge(left=game_publisher_relation_bgg,
                                           right=game_keys,
                                           on='bgg_game_id')
    game_publisher_relation_bgg = game_publisher_relation_bgg[[
        'game_key', 'publisher_name'
    ]]

    # replace bgg publisher name with publisher key
    game_publisher_relation_bgg = pd.merge(left=game_publisher_relation_bgg,
                                           right=publishers,
                                           left_on='publisher_name',
                                           right_on='bgg_publisher_name')
    game_publisher_relation_bgg = game_publisher_relation_bgg[[
        'game_key', 'publisher_key'
    ]]

    # concat both dataframes:
    game_publisher_relation_combined = pd.concat(
        [game_publisher_relation_bga, game_publisher_relation_bgg])

    # remove duplicates:
    game_publisher_relation_combined.drop_duplicates(inplace=True)

    # export game_publisher relation:
    export_path = '../Data/Joined/Results/Publisher_Game_Relation.csv'
    export_df_to_csv(game_publisher_relation_combined, export_path)


def merge_bga_and_bgg_designers():
    # import bga data:
    import_path_1 = '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv'
    bga_designers_game_relation = pd.read_csv(import_path_1, index_col=0)

    # import bgg data:
    import_path_2_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/03_BGG_Game_designer_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    bgg_designers_game_relation = pd.read_csv(import_path_2, index_col=0)

    # drop NA's from bga_designers_game_relation names
    bga_designers_na = bga_designers_game_relation[
        bga_designers_game_relation['designer_name'].isnull()]
    if len(bga_designers_na) > 0:
        print(
            str(len(bga_designers_na)) +
            ' rows dropped from bga_game_relation table because designer_names are missing.'
        )
        bga_designers_game_relation = bga_designers_game_relation[
            ~bga_designers_game_relation['designer_name'].isnull()]

    # create designers df:
    bga_designers_df = bga_designers_game_relation[[
        'designer_name', 'designer_bga_image_url', 'designer_url',
        'designer_id'
    ]].drop_duplicates()
    bga_designers_df.rename(columns={
        'designer_bga_image_url': 'designer_image_url',
        'designer_id': 'designer_bga_id'
    },
                            inplace=True)

    bgg_designers_df = pd.DataFrame(
        bgg_designers_game_relation[['designer_name']].drop_duplicates())
    bgg_designers_df.rename(columns={'designer_name': 'bgg_designer_name'},
                            inplace=True)

    # add bgg_designer_key:
    bgg_designers_df.insert(0, 'designer_bgg_key',
                            range(1, 1 + len(bgg_designers_df)))

    # extract designer names:
    bga_designer_names_list = bga_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()
    bgg_designer_names_list = bgg_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()

    # match designer names:
    # get exact name matches:
    exact_matches = list(
        set(bga_designer_names_list).intersection(bgg_designer_names_list))

    # subsets for data that could not get matched exactly:
    bga_names_not_matched_list = [
        name for name in bga_designer_names_list if name not in exact_matches
    ]
    bgg_names_not_matched_list = [
        name for name in bgg_designer_names_list if name not in exact_matches
    ]

    # jaccard matching for names that could not be matched exactly
    matches = []
    for bga_designer in bga_names_not_matched_list:
        match = find_match(bga_designer, bgg_names_not_matched_list,
                           JACCARD_THRESHOLD_DESIGNERS)
        matches.append({
            'bga_name': bga_designer,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })

    # create list of matched designer names:
    jaccard_matches_bga = [
        designer['bga_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]
    jaccard_matches_bgg = [
        designer['bgg_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]

    # create lists of all designer names matched via jaccard:
    all_matches_bga = exact_matches + jaccard_matches_bga
    all_matches_bgg = exact_matches + jaccard_matches_bgg

    # create dataframe of matched designers:
    jaccard_matches_df = pd.DataFrame(matches)
    jaccard_matches_df = jaccard_matches_df[
        jaccard_matches_df['jaccard_score'] != ''].sort_values('jaccard_score',
                                                               ascending=False)
    del jaccard_matches_df['jaccard_score']

    # 1) Create DF of all designers that could be matched
    #       a) exact matches
    #       b) jaccard matches
    # 2) Create DF of bga designers that could not be matched
    # 3) Create DF of bgg designers that could not be matched
    # 4) Concat all DFs to one designers df

    # Structure: designer_key | designer_name | designer_bga_id | designer_bgg_key | designer_url | designer_image_url
    # 1) a)
    bga_exact_matches = bga_designers_df[
        bga_designers_df['designer_name'].isin(exact_matches)]
    bgg_exact_matches = bgg_designers_df[
        bgg_designers_df['bgg_designer_name'].isin(exact_matches)]

    joined_exact_matches = pd.merge(left=bga_exact_matches,
                                    right=bgg_exact_matches,
                                    left_on='designer_name',
                                    right_on='bgg_designer_name')

    # 1) b)
    bga_jaccard_matches = pd.merge(left=bga_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='designer_name',
                                   right_on='bga_name')
    bgg_jaccard_matches = pd.merge(left=bgg_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='bgg_designer_name',
                                   right_on='bgg_name')

    joined_jaccard_matches = pd.merge(left=bga_jaccard_matches,
                                      right=bgg_jaccard_matches,
                                      left_on='designer_name',
                                      right_on='bga_name')
    # drop columns not needed
    joined_jaccard_matches = joined_jaccard_matches[[
        'designer_name', 'designer_bga_id', 'designer_bgg_key',
        'bgg_designer_name', 'designer_url', 'designer_image_url'
    ]]

    # 2)
    bga_no_matches = bga_designers_df[~bga_designers_df['designer_name'].
                                      isin(all_matches_bga)]

    # 3)
    bgg_no_matches = bgg_designers_df[~bgg_designers_df['bgg_designer_name'].
                                      isin(all_matches_bgg)].copy()
    # add designer_name column (the bgg name is the only name available here):
    bgg_no_matches['designer_name'] = bgg_no_matches['bgg_designer_name']

    # 4) Create large dataframe by concatenating all dataframes:
    # size: 473 [1a] + 7 [1b] + 25 [2] + 5928 [3] = 6433
    designers_df = pd.concat([
        joined_exact_matches, joined_jaccard_matches, bga_no_matches,
        bgg_no_matches
    ])

    # remove duplicates (before adding the key, since a unique key would mask them):
    designers_df.drop_duplicates(inplace=True)

    # add designer key:
    designers_df.insert(0, 'designer_key', range(1, 1 + len(designers_df)))

    # export designers
    export_path = '../Data/Joined/Results/Designer.csv'
    export_df_to_csv(designers_df, export_path)


def merge_online_games():
    # import all online game csvs to dataframes
    yucata_scrape_filename = get_latest_version_of_file(
        '../Data/Onlinegames/Yucata/Raw/Yucata_all_data_raw.csv')
    yucata_scrape_df = pd.read_csv(yucata_scrape_filename, sep=';')
    boardgamearena_scrape_filename = get_latest_version_of_file(
        '../Data/Onlinegames/Boardgamearena/Raw/Boardgamearena_all_data_raw.csv'
    )
    boardgamearena_scrape_df = pd.read_csv(boardgamearena_scrape_filename,
                                           sep=';')
    tabletopia_scrape_filename = get_latest_version_of_file(
        '../Data/Tabletopia/Tabletopia/Raw/tabletopia_all_data_raw.csv')
    tabletopia_scrape_df = pd.read_csv(tabletopia_scrape_filename, sep=',')

    # drop potential duplicates
    yucata_scrape_df.drop_duplicates(inplace=True)
    boardgamearena_scrape_df.drop_duplicates(inplace=True)
    tabletopia_scrape_df.drop_duplicates(inplace=True)

    # rename columns
    yucata_scrape_df.rename(columns={
        'YucataName': 'Name',
        'YucataPlayers': 'yucata_players',
        'YucataTime': 'playtime',
        'YucataAuthor': 'yucata_author',
        'YucataLink': 'Onlinegamelink',
        'YucataBoardGameGeekIF': 'BGGID'
    },
                            inplace=True)
    boardgamearena_scrape_df.rename(columns={
        'game_name_boardgamearena': 'Name',
        'game_url_boardgamearena': 'Onlinegamelink'
    },
                                    inplace=True)
    tabletopia_scrape_df.rename(columns={
        'game_name_tabletopia': 'Name',
        'game_url_tabletopia': 'Onlinegamelink',
        'bgg_link_tabletopia': 'bgg_url',
    },
                                inplace=True)

    # Add origin of data as text
    yucata_scrape_df['Origin'] = 'Yucata'
    boardgamearena_scrape_df['Origin'] = 'Boardgamearena'
    tabletopia_scrape_df['Origin'] = 'Tabletopia'

    # Extract the BGG ID from the bgg url,
    # e.g. '.../boardgame/174430/gloomhaven' -> '174430':
    tabletopia_scrape_df['BGGID'] = tabletopia_scrape_df[
        'bgg_url'].str.extract('e/(.+?)/')

    # Delete unnecessary columns and rename all attributes consistently
    yucata_scrape_df = yucata_scrape_df.drop(columns=[
        'yucata_players', 'playtime', 'YucataAge', 'yucata_author',
        'YucataIllustrator', 'YucataPublisher', 'YucataDeveloper',
        'YucataOnlinesince'
    ])
    boardgamearena_scrape_df = boardgamearena_scrape_df.drop(columns=[
        'number_players_boardgamearena', 'playing_time_boardgamearena',
        'game_strategy_boardgamearena', 'game_interaction_boardgamearena',
        'game_complexity_boardgamearena', 'game_luck_boardgamearena',
        'rounds_played_boardgamearena', 'available_since_boardgamearena',
        'version_boardgamearena', 'description_boardgamearena',
        'author_name_boardgamearena', 'graphicer_name_boardgamearena',
        'publisher_name_boardgamearena',
        'basegame_release_year_boardgamearena', 'developer_name_boardgamearena'
    ])
    tabletopia_scrape_df = tabletopia_scrape_df.drop(columns=[
        'player_min_age_tabletopia', 'number_players_tabletopia',
        'playing_time_tabletopia', 'rating_tabletopia', 'designer_tabletopia',
        'illustrator_tabletopia', 'publisher_tabletopia', 'author_tabletopia',
        'genre_tabletopia', 'description_tabletopia', 'bgg_url'
    ])

    # merge dataframes (DataFrame.append is deprecated, so use pd.concat):
    onlinegames_merge_df = pd.concat(
        [yucata_scrape_df, boardgamearena_scrape_df, tabletopia_scrape_df],
        ignore_index=True)
    onlinegames_merge_df = onlinegames_merge_df[
        ['Name', 'Onlinegamelink', 'Origin', 'BGGID']]

    # create primary keys
    onlinegames_merge_df.insert(0, 'Onlinegamelink ID',
                                range(1, 1 + len(onlinegames_merge_df)))

    # export dataframe
    export_path = '../Data/Onlinegames/Raw/Onlineboardgames_table_raw.csv'
    onlinegames_merge_df.to_csv(export_path, na_rep='NULL', sep=';')


def match_online_game_names_and_bgg_names():
    onlinegames_filename = get_latest_version_of_file(
        '../Data/Onlinegames/Raw/Onlineboardgames_table_raw.csv')

    onlinegames_df = pd.read_csv(onlinegames_filename, sep=';')

    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    bgg_names = bgg_df['name'].tolist()

    # Extract only games without BGG ID to match
    onlinegames_games_without_BGGID = onlinegames_df[
        onlinegames_df['BGGID'].isna()]
    onlinegame_names_without_BGGID = onlinegames_games_without_BGGID[
        'Name'].tolist()

    # Find exact matches Onlinegames - BGG
    exact_matches = list(
        set(bgg_names).intersection(set(onlinegame_names_without_BGGID)))

    # Exact matches as list of dicts (can later be used to create a pd.DF)
    exact_matches_list_of_dict = [{
        'online_name': x,
        'bgg_name': x
    } for x in exact_matches]

    # subtract exact matches from datasets:
    subset_bgg_df = bgg_df[~bgg_df['name'].isin(exact_matches)]
    subset_onlinegames_df = onlinegames_games_without_BGGID[
        ~onlinegames_games_without_BGGID['Name'].isin(exact_matches)]
    subset_onlinegame_names_without_BGGID = subset_onlinegames_df[
        'Name'].tolist()
    subset_bgg_df_names = subset_bgg_df['name'].tolist()

    # Match left over names Onlinegames - BGG
    match_list = []

    for name in subset_onlinegame_names_without_BGGID:
        match = find_match(name, subset_bgg_df_names,
                           JACCARD_THRESHOLD_GAME_NAME)
        match_list.append({
            'online_name': name,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })

    # drop entries that could not be matched:
    match_list = [x for x in match_list if x['jaccard_score'] != '']

    # add exact matches to match_list:
    match_list = match_list + exact_matches_list_of_dict

    matches_df = pd.DataFrame(match_list)

    # merge matches and bgg to get bgg ids:
    merge_1 = pd.merge(left=matches_df,
                       right=bgg_df,
                       left_on='bgg_name',
                       right_on='name')
    matches_df = merge_1[['bgg_name', 'online_name', 'bgg_game_id']]

    # merge matches and online games df:
    merge_2 = pd.merge(left=onlinegames_games_without_BGGID,
                       right=matches_df,
                       left_on='Name',
                       right_on='online_name')
    merge_2['BGGID'] = merge_2['bgg_game_id']

    # keep only columns from original online games df:
    merge_2 = merge_2[[
        'Onlinegamelink ID', 'Name', 'Onlinegamelink', 'Origin', 'BGGID'
    ]]

    # create a temp_df with all games from the online games df that are not part of merge_2
    # (games that already had a BGGID and games that could not be matched in this run):
    temp_df = onlinegames_df[~onlinegames_df['Onlinegamelink ID'].
                             isin(merge_2['Onlinegamelink ID'].tolist())]

    # combine both to get the full dataset with the additional information about the bgg_game_ids out of the games that
    # were successfully matched:
    onlinegames_df = pd.concat([temp_df, merge_2])
    onlinegames_df.drop_duplicates(subset=['Onlinegamelink ID'], inplace=True)

    ## export online games:
    # rename a few columns
    onlinegames_df.rename(columns={
        'Name': 'name',
        'Onlinegamelink ID': 'online_game_id',
        'Onlinegamelink': 'url',
        'Origin': 'origin',
        'BGGID': 'bgg_id'
    },
                          inplace=True)
    onlinegames_df = onlinegames_df.drop(columns=['Unnamed: 0'])

    # If bgg_id has to be int (beware of NaN conversion!)
    #onlinegames_df['bgg_id'] = onlinegames_df['bgg_id'].fillna(0.0).astype(int)
    #onlinegames_df['bgg_id'] = onlinegames_df['bgg_id'].astype(int)
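    # (pandas' nullable integer dtype 'Int64' sidesteps the NaN problem, if
    # integer ids are needed:)
    #onlinegames_df['bgg_id'] = onlinegames_df['bgg_id'].astype('Int64')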

    # drop online games without bgg_id:
    onlinegames_df = onlinegames_df[~onlinegames_df['bgg_id'].isna()]

    # export result to csv:
    export_path = '../Data/Onlinegames/Processed/online_games.csv'
    export_df_to_csv(onlinegames_df, export_path)


def match_game_names():
    """
    This function matches bga and bgg boardgames based on their game names and the year in which they were published.
    This is how it works:
    - We calculate n-grams with n=3 for each boardgame name.
    - We remove stopwords that appear in many game titles without adding much meaning; this reduces the number of
    false positives and false negatives.
    Examples: the stopwords 'board' and 'game' are removed:
        bga_name = '7 Wonders'
        bgg_name = '7 Wonders - The Board Game'
        -> without removing the stopwords this would result in a rather low jaccard score.

        bga_name = 'Settlers - The Board Game'
        bgg_name = '7 Wonder - The Board Game'
        -> without removing the stopwords this would result in a rather high jaccard score, even though the two
        names do not refer to the same game.
    - We then compare the similarity of a bga candidate and a bgg candidate by calculating the jaccard similarity.
    - The candidate with the highest jaccard score is chosen. The games are only matched if that candidate's
    jaccard score exceeds our threshold.

    Scalability challenge:
    - There is one issue with that strategy: computing the jaccard similarity for every pair of ca. 8,000 bga games
    and ca. 19,000 bgg games is quadratic [O(n*m) comparisons]. Comparing all bga_games with all bgg_games would
    lead to an extremely long run time, which we want to avoid.
    -> 8,000 x 19,000 = 152,000,000 comparisons.

    Therefore we adjusted our approach:
    1) First, we find games that can be matched exactly, i.e. games that have exactly the same name in both
    datasets. Since there are some games with duplicate names that do not refer to the same game, we also include
    the year of publication: only games with exactly the same name and exactly the same year of publication are
    matched in this step. We can then subtract these games from their datasets, which shrinks the sets that still
    have to be compared to ca. 3,000 bga games and ca. 15,000 bgg games.
    -> 3,000 x 15,000 = 45,000,000 (complexity reduced by ~70%)
    2) This is still quite a lot of comparisons. However, we made another observation: when we matched games by
    game_name alone (ignoring year_published), the publish years of the exactly matched games agreed in almost
    all cases, which is to be expected.
    3) Therefore we can further reduce complexity by grouping games by publish year and comparing only games with
    the same publish year. To make sure we don't lose games whose publish years deviate by one year, we also
    compare against games published one year earlier and one year later.
    This further reduces the number of comparisons to ~1,000,000.
    Hence, by applying the similarity function only to the most promising pairs, we reduced the number of
    required comparisons by about 98%. (A minimal sketch of the jaccard matching helper follows after this
    function.)
    """

    # Import bgg and bga data:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    bgg_names = bgg_df['name'].tolist()

    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)
    bga_names = bga_df['name'].tolist()

    # Create lists with bga and bgg ids (kept for manual inspection; not used below):
    bgg_ids = bgg_df['bgg_game_id'].tolist()
    bga_ids = bga_df['bga_game_id'].tolist()

    # Check duplicate names (likewise only for inspection):
    bgg_duplicate_names = set(x for x in bgg_names if bgg_names.count(x) > 1)
    bga_duplicate_names = set(x for x in bga_names if bga_names.count(x) > 1)

    ## find exact matches (game_name, year_published):
    exact_matches_join_df = pd.merge(left=bgg_df,
                                     right=bga_df,
                                     on=['name', 'year_published'])

    # create list of ids of exactly matched games:
    exact_matches_bgg_ids = exact_matches_join_df['bgg_game_id'].tolist()
    exact_matches_bga_ids = exact_matches_join_df['bga_game_id'].tolist()

    # subtract exact matches from datasets to reduce their size
    # (.copy() avoids SettingWithCopyWarning on the renames below):
    subset_bgg_df = bgg_df[
        ~bgg_df['bgg_game_id'].isin(exact_matches_bgg_ids)].copy()
    subset_bga_df = bga_df[
        ~bga_df['bga_game_id'].isin(exact_matches_bga_ids)].copy()
    subset_bgg_df.rename(columns={'year_published': 'year_published_bgg'},
                         inplace=True)
    subset_bga_df.rename(columns={'year_published': 'year_published_bga'},
                         inplace=True)

    ## In the next part we now want to apply name matching. Our first task is to find candidates so that we don't
    ## have to compare all games from one dataset with all games from the other dataset. We do so by grouping by
    ## their year of publication.
    ## First, we need some preprocessing steps so that we can actually set up our candidates:

    # Extract years from bga dataset:
    # A lot of type casting due to unexpected errors with float and set
    all_years = subset_bga_df['year_published_bga'].dropna().tolist()
    all_years = list(map(int, all_years))
    years = list(set(all_years))
    years.sort(reverse=True)

    # Do not apply name matching to games where the publish_year is missing:
    print('Dropped ' + str(subset_bgg_df['year_published_bgg'].isna().sum()) +
          ' rows from bgg_dataset from name_matching')
    print('Dropped ' + str(subset_bga_df['year_published_bga'].isna().sum()) +
          ' rows from bga_dataset from name_matching')
    subset_bgg_df.dropna(subset=['year_published_bgg'], inplace=True)
    subset_bga_df.dropna(subset=['year_published_bga'], inplace=True)

    # strip of '.0' at the end of each year by converting to int: 2018.0 -> 2018
    subset_bga_df["year_published_bga"] = subset_bga_df[
        "year_published_bga"].astype(int)

    # create a dictionary to group all bgg games by their year of publication
    # during the name matching process we will only compare the names of games with the same publication year
    bgg_dic_grouped_by_year = {}
    bga_dic_grouped_by_year = {}

    # fill the previously created dictionaries that include all the games that were published in a certain year
    for year in years:
        bgg_dic_grouped_by_year[year] = subset_bgg_df[
            subset_bgg_df['year_published_bgg'] == year].to_dict('records')
        bga_dic_grouped_by_year[year] = subset_bga_df[
            subset_bga_df['year_published_bga'] == year].to_dict('records')

    ## Now we get to the interesting part:
    ## We iterate over all bga_games which we found no exact bgg_matches for. We then create a list with potential
    ## candidates including all bgg_games that were published in the same year or one year before or after.
    ## For these candidates we then apply name_matching using the jaccard similarity.
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            input_string = bga_game['name']

            candidate_list = []
            # create candidate_list with all bgg games that were published in the same year as the bga_game:
            for bgg_game in bgg_dic_grouped_by_year[year]:
                candidate_list.append(bgg_game['name'])

            # also check bgg games that were published in the previous year and one year later:
            if year + 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year + 1]:
                    candidate_list.append(bgg_game['name'])
            if year - 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year - 1]:
                    candidate_list.append(bgg_game['name'])

            # Try to match the input_string (the bga game name) to one of the games in the candidate_list
            # (bgg games). The match with the highest jaccard similarity is returned. If there is no match or the
            # jaccard threshold cannot be exceeded, an empty string is returned.
            match = find_match(input_string, candidate_list,
                               JACCARD_THRESHOLD_GAME_NAME)
            bga_game['match'] = match['name']
            bga_game['jaccard_score'] = match['jaccard_score']

    global COMPARISONS
    print('Number of comparisons: ' + str(COMPARISONS))

    bga_list_matches = []
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            bga_list_matches.append(bga_game)

    # turn list of dictionaries back to data frame:
    jaccard_matches_df = pd.DataFrame(bga_list_matches)

    # just for debugging and inspecting results:
    analyse_df = pd.DataFrame(bga_list_matches)
    analyse_df = analyse_df[analyse_df['jaccard_score'] != '']
    analyse_df = analyse_df[['name', 'match', 'jaccard_score']]
    analyse_df = analyse_df.sort_values('jaccard_score', ascending=False)

    ## We have now successfully found a large number of games that could be matched. All that's left to do is
    #  to create a dataframe that contains the matched BGA and BGG IDs. We do so in three steps:
    # 1) Prepare DF containing BGA and BGG IDs of games that could be matched exactly by name and year_published
    # 2) Prepare DF containing BGA and BGG IDs of games that could be matched by string matching (jaccard method)
    # 3) Concatenate both data frames

    # 1) Exact matches
    # Keep only ID columns:
    exact_matches_join_df = exact_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 2) Jaccard matches
    # Cut off rows where the jaccard threshold wasn't reached (-> no match)
    jaccard_matches_df = jaccard_matches_df[jaccard_matches_df['match'] != '']
    jaccard_matches_df = jaccard_matches_df[[
        'bga_game_id', 'name', 'year_published_bga', 'match', 'jaccard_score'
    ]]
    jaccard_matches_df.rename(columns={'name': 'bga_name'}, inplace=True)

    # Join both datasets
    jaccard_matches_join_df = pd.merge(
        left=bgg_df[['bgg_game_id', 'name', 'year_published']],
        right=jaccard_matches_df,
        left_on=['name', 'year_published'],
        right_on=['match', 'year_published_bga'])
    jaccard_matches_join_df = jaccard_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 3) Concat both dfs
    matched_game_ids_df = pd.concat(
        [exact_matches_join_df, jaccard_matches_join_df])

    # 4) Store matches to csv:
    export_df_to_csv(
        matched_game_ids_df,
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv'
    )
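

# A minimal sketch of the jaccard matching helper described in the docstring of
# match_game_names() above. The real find_match (with the global COMPARISONS
# counter it increments) is defined elsewhere in the project, as are the
# threshold constants; the values below are only illustrative:
JACCARD_THRESHOLD_GAME_NAME = 0.8  # illustrative; actual value not shown here
JACCARD_THRESHOLD_DESIGNERS = 0.8  # illustrative; actual value not shown here
STOPWORDS = {'board', 'game'}  # illustrative; the real stopword list may differ
COMPARISONS = 0


def ngrams(name, n=3):
    # lowercase, drop stopwords, then build the set of character 3-grams
    tokens = [t for t in name.lower().split() if t not in STOPWORDS]
    text = ' '.join(tokens)
    return {text[i:i + n] for i in range(max(len(text) - n + 1, 1))}


def find_match(input_string, candidate_list, threshold):
    # return the candidate with the highest jaccard similarity; if no
    # candidate reaches the threshold, name and score stay empty strings
    global COMPARISONS
    best = {'name': '', 'jaccard_score': ''}
    best_score = threshold
    input_grams = ngrams(input_string)
    for candidate in candidate_list:
        COMPARISONS += 1
        candidate_grams = ngrams(candidate)
        score = len(input_grams & candidate_grams) / len(
            input_grams | candidate_grams)
        if score >= best_score:
            best = {'name': candidate, 'jaccard_score': score}
            best_score = score
    return best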


def merge_game_information():
    '''
    Merges the game information of the previously matched games.

    For the matched games there are four types of columns:
        a) columns that exist in both datasets, of which we only need to keep one (values may conflict):
            (e.g. name, year_published, min_players, ...)
            In this case we chose to keep the bgg columns ["trust your friends" strategy: in case of
            contradicting values we decide which value to keep based on the data source it comes from]
        b) columns that exist in both datasets and of which we want to keep both:
            (e.g. bga_game_id/bgg_game_id, num_user_ratings, average_user_rating, bga_rank/bgg_rank, ...)
        c) columns that exist only in the bgg dataset:
            (e.g. num_user_comments, bgg_average_weight, ...)
        d) columns that exist only in the bga dataset:
            (e.g. reddit_all_time_count, bga_game_url, ...)
    '''
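    # Note on step 2a) below: pandas suffixes overlapping column names with
    # '_x'/'_y' when merging. Dropping the duplicated bga columns before the
    # merge therefore both implements the "keep the bgg values" choice and
    # avoids suffix clutter in the merged frame.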
    # import data:
    # bgg game information dataset:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    # bga game information dataset:
    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)

    # 1) this leaves us with three groups:
    # a) Matched Games
    # b) BGG Games that could not be matched
    # c) BGA Games that could not be matched

    # 1a) matched games:
    ids_matched_games_df = pd.read_csv(
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv',
        index_col=0)
    bgg_subset_matches = bgg_df[bgg_df['bgg_game_id'].isin(
        ids_matched_games_df['bgg_game_id'])]
    bga_subset_matches = bga_df[bga_df['bga_game_id'].isin(
        ids_matched_games_df['bga_game_id'])]
    # 1b) BGG games no matched:
    bgg_subset_no_matches = bgg_df[~bgg_df['bgg_game_id'].
                                   isin(ids_matched_games_df['bgg_game_id'])]
    # 1c) BGA games no matched:
    bga_subset_no_matches = bga_df[~bga_df['bga_game_id'].
                                   isin(ids_matched_games_df['bga_game_id'])]

    # 2) For the matched games, handle the four column types a)-d) described in the docstring above.

    # 2a) drop the duplicated columns from the bga dataset (keeping the bgg versions):
    drop_bga_columns = [
        'name', 'year_published', 'min_players', 'max_players', 'min_playtime',
        'max_playtime', 'min_age', 'game_description', 'image_url',
        'thumbnail_url'
    ]
    bga_subset_matches = bga_subset_matches.drop(columns=drop_bga_columns)

    # add the matched bgg_game_id column:
    bga_subset_matches = pd.merge(left=bga_subset_matches,
                                  right=ids_matched_games_df,
                                  on='bga_game_id')

    # merge both datasets:
    matched_games_df = pd.merge(left=bgg_subset_matches,
                                right=bga_subset_matches,
                                left_on=['bgg_game_id'],
                                right_on=['bgg_game_id'])

    # Handle duplicate ids in matched_games_df:
    # remove duplicates:
    # duplicate bgg_ids:
    matched_games_df.drop_duplicates(subset=['bgg_game_id'],
                                     keep='first',
                                     inplace=True)
    # duplicate bga_ids:
    matched_games_df.drop_duplicates(subset=['bga_game_id'],
                                     keep='first',
                                     inplace=True)

    # In a last (union) step we concatenate all three dataframes into one big dataframe
    # (DataFrame.append is deprecated, so use pd.concat):
    games_df = pd.concat(
        [matched_games_df, bgg_subset_no_matches, bga_subset_no_matches],
        ignore_index=True,
        sort=False)

    # reorder columns:
    cols_to_order = [
        'name', 'bgg_game_id', 'bga_game_id', 'year_published', 'min_players',
        'max_players', 'min_playtime', 'max_playtime', 'min_age',
        'bgg_average_user_rating', 'bga_average_user_rating',
        'bgg_num_user_ratings', 'bga_num_user_ratings'
    ]
    new_columns = cols_to_order + (
        games_df.columns.drop(cols_to_order).tolist())
    games_df = games_df[new_columns]

    # check for duplicates in games_df before adding a unique key (which would mask them):
    games_df_duplicates = len(games_df) - len(games_df.drop_duplicates())

    if games_df_duplicates > 0:
        print('Warning. ' + str(games_df_duplicates) +
              ' duplicates found in BoardGameTable: ')
        games_df.drop_duplicates(inplace=True)
        print('Duplicates removed!')

    # create new unique key_column:
    games_df.insert(0, 'game_key', range(100001, 100001 + len(games_df)))

    # create key_csv that contains bga_game_id, bgg_game_id and game_key
    # (.copy() so the duplicate handling below does not operate on a slice):
    key_df = games_df[['game_key', 'bga_game_id', 'bgg_game_id']].copy()

    # check if there are any duplicates in key_df:
    count_duplicates_bgg = len(key_df[~key_df['bgg_game_id'].isnull()]) - len(
        key_df[~key_df['bgg_game_id'].isnull()].drop_duplicates(
            subset='bgg_game_id'))
    count_duplicates_bga = len(key_df[~key_df['bga_game_id'].isnull()]) - len(
        key_df[~key_df['bga_game_id'].isnull()].drop_duplicates(
            subset='bga_game_id'))

    if (count_duplicates_bgg + count_duplicates_bga) > 0:
        print('Warning. Duplicates found: ')
        print('BGG_game_ids: ' + str(count_duplicates_bgg))
        print('BGA_game_ids: ' + str(count_duplicates_bga))
        key_df.drop_duplicates(inplace=True)
        print('Duplicates removed!')

    # Fix badly encoded symbols in the game descriptions.
    # Each entry is (pattern, replacement, use_regex). The mojibake patterns
    # (e.g. 'Ã¤' -> 'ä') restore characters that were double-encoded in the
    # raw data; the regex patterns remove leftover html entities.
    replacements = [
        # normalize quotation marks:
        ('"', '\'', False),
        ('”', '\'', False),
        ('’', '\'', False),
        ('“', '\'', False),
        ('&amp;', '&', False),
        ('Ã©', 'e', False),
        # repair Umlaute:
        ('Ã¤', 'ä', False),
        ('Ãœ', 'ü', False),
        ('Ã¼', 'ü', False),
        ('Ã¶', 'ö', False),
        ('ÃŸ', 'ß', False),
        # normalize dashes, multiplication sign, non-breaking space and soft hyphen:
        ('–', '-', False),
        ('—', '-', False),
        ('\xa0', ' ', False),
        ('×', 'x', False),
        ('\xad', '-', False),
        # kick html entities such as '&#039;':
        ('&#...;', '', True),
        ('&#..;', ' ', True),
        ('&#.;', '', True),
        # remove entity remnants: any few characters directly before a ';'
        # (note: this is aggressive and also shortens regular prose around semicolons):
        ('.....;', '', True),
        ('....;', '', True),
        ('...;', '', True),
        ('..;', '', True),
        ('.;', '', True),
        # remove double semicolons and collapse multiple spaces:
        (';;', ' ', False),
        (' +', ' ', True),
    ]
    for pattern, replacement, use_regex in replacements:
        games_df['game_description'] = games_df[
            'game_description'].str.replace(pattern, replacement,
                                            regex=use_regex)
    games_df['game_description'] = games_df['game_description'].str.strip()

    # export to csv:
    export_df_to_csv(games_df, '../Data/Joined/Results/BoardGames.csv')
    export_df_to_csv(
        key_df,
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv')
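

# A plausible execution order for the steps above, inferred from the file
# dependencies visible in the functions (this is an assumption; some inputs,
# e.g. publisher.csv, are produced by steps not shown in this listing):
if __name__ == '__main__':
    match_game_names()            # writes matched_bga_and_bgg_ids.csv
    merge_game_information()      # writes BoardGames.csv and the game-key file
    normalize_bga_game_categories_relation()
    normalize_bgg_game_categories_relation()
    integrate_game_name_relation_tables()
    create_list_of_all_bga_designers()
    merge_scraped_bga_designer_data_and_api_data()
    merge_bga_and_bgg_designers()
    merge_bga_and_bgg_publisher_game_relation()  # needs publisher.csv
    merge_online_games()
    match_online_game_names_and_bgg_names()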