Code Example #1
def get_bga_categories_from_api():
    url = 'https://api.boardgameatlas.com/api/game/categories?'
    params = {'client_id': '16OTwjJZDB'}

    # query API to get all categories
    categories = bga_api_call(url, params)['categories']

    # create categories dataframe
    categories_df = pd.DataFrame(categories)

    # drop checked column:
    del categories_df['checked']

    # rename a few columns
    categories_df.rename(columns={
        'id': 'category_bga_id',
        'name': 'category_name',
        'url': 'category_bga_url'
    },
                         inplace=True)

    # export bga_categories:
    export_df_to_csv(
        categories_df,
        '../Data/BoardGameAtlas/Raw/API/Categories/all_bga_categories.csv')
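
The listings in this corpus rely on a pair of helpers that are never shown. A minimal sketch of what they plausibly look like, assuming bga_api_call is a thin wrapper around requests and export_df_to_csv writes the dataframe including its index (the re-imports elsewhere use index_col=0):

import os
import requests
import pandas as pd

def bga_api_call(url, params):
    # Sketch: issue a GET request against the Board Game Atlas API and
    # return the decoded JSON payload as a dict.
    response = requests.get(url, params=params)
    response.raise_for_status()
    return response.json()

def export_df_to_csv(df, path):
    # Sketch: make sure the target directory exists, then write the
    # dataframe with its index (matching the index_col=0 re-imports).
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path)
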
Code Example #2
def extract_users():
    # import reviews df:
    reviews_path = '../Data/Joined/Integration/Reviews/Reviews_All_Games_Integrated.pickle'
    all_reviews = import_pickle_to_dataframe(reviews_path)
    # remove the leftover index column from all_reviews
    # (using index_col=0 in the original read_csv threw an error for some unknown reason,
    # so the column is dropped explicitly here instead):
    all_reviews = all_reviews.drop(all_reviews.columns[0], axis=1)

    # Create user dataframe:
    users_df = all_reviews.groupby(['user_name', 'review_origin'
                                    ]).size().reset_index(name='num_ratings')

    # Count individual users in both datasets:
    bga_users = users_df[users_df['review_origin'] == 'bga']
    sum_bga_users = len(bga_users)
    bgg_users = users_df[users_df['review_origin'] == 'bgg']
    sum_bgg_users = len(bgg_users)
    print('User count:')
    print('BoardGameAtlas users: ' + str(sum_bga_users))
    print('BoardGameGeeks users: ' + str(sum_bgg_users))

    # Add average rating column to user df:
    # (both groupby calls use the same keys and default sorting, so the rows align)
    users_df['avg_rating'] = all_reviews.groupby(
        ['user_name', 'review_origin'], as_index=False).agg({'rating':
                                                             'mean'})['rating']

    # Rename origin column:
    users_df.rename(columns={'review_origin': 'user_origin'}, inplace=True)

    # Create user_id:
    users_df.insert(0, 'user_key', range(1, 1 + len(users_df)))

    # Export users to csv:
    export_df_to_csv(users_df, '../Data/Joined/Results/User.csv')
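
extract_users() also assumes a pickle-import helper. Given that the pickle files hold pandas dataframes, a plausible minimal wrapper is:

import pandas as pd

def import_pickle_to_dataframe(path):
    # Sketch: the pickle is assumed to contain a pandas dataframe,
    # so pd.read_pickle is all that is needed.
    return pd.read_pickle(path)
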
Code Example #3
def get_bga_mechanics_from_api():
    url = 'https://api.boardgameatlas.com/api/game/mechanics?'
    params = {'client_id': '16OTwjJZDB'}

    # query API to get all mechanics
    mechanics = bga_api_call(url, params)['mechanics']

    # create mechanics dataframe
    mechanics_df = pd.DataFrame(mechanics)

    # drop checked column:
    del mechanics_df['checked']

    # rename a few columns
    mechanics_df.rename(columns={
        'id': 'mechanic_bga_id',
        'name': 'mechanic_name',
        'url': 'mechanic_bga_url'
    },
                        inplace=True)

    # export bga_mechanics:
    export_df_to_csv(
        mechanics_df,
        '../Data/BoardGameAtlas/Raw/API/Mechanics/all_bga_mechanics.csv')
Code Example #4
def normalize_bga_game_categories_relation():
    # import bga_games_category_relation:
    categories_bga_games_relation_path_fuzzy = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_*.json'
    categories_bga_games_relation_path = get_latest_version_of_file(
        categories_bga_games_relation_path_fuzzy)
    categories_bga_games_relation_df = import_json_to_dataframe(
        categories_bga_games_relation_path)

    # import bga categories:
    categories_bga = pd.read_csv(
        '../Data/BoardGameAtlas/Raw/API/categories/all_bga_categories.csv',
        index_col=0)

    # import game keys:
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # join games_category_relation table to replace bga_game_id column with game_keys column:
    categories_bga_games_relation_df = pd.merge(
        left=categories_bga_games_relation_df,
        right=game_keys,
        left_on='game_id',
        right_on='bga_game_id')

    # normalize by only keeping game_key and category_id
    categories_bga_games_relation_df = categories_bga_games_relation_df[[
        'game_key', 'category_id'
    ]]

    # export df
    export_path = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv'
    export_df_to_csv(categories_bga_games_relation_df, export_path)
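
get_latest_version_of_file resolves the wildcard paths used throughout these listings. A minimal sketch, assuming "latest" means the most recently modified match; the real helper may instead parse the %d_%m_%Y-%H_%M timestamp embedded in the filenames (see the exports in clean_bgg_games):

import glob
import os

def get_latest_version_of_file(fuzzy_path):
    # Sketch: expand the wildcard and return the newest match by mtime.
    candidates = glob.glob(fuzzy_path)
    if not candidates:
        raise FileNotFoundError('no file matches ' + fuzzy_path)
    return max(candidates, key=os.path.getmtime)
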
Code Example #5
def replace_old_ids_with_new_key_and_concatenate_category_relation_tables():
    # import bga_category_game_relations:
    path_1 = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv'
    bga_categories_game_relation = pd.read_csv(path_1, index_col=0)

    # import bgg_category_game_relations:
    path_2 = '../Data/BoardGameGeeks/Processed/GameInformation/05_BGA_Game_Categories_Relation_Cleaned.csv'
    bgg_categories_game_relation = pd.read_csv(path_2, index_col=0)

    # import categories:
    path_3 = '../Data/Joined/Integration/GameInformation/05_categories_Integrated_with_bga_and_bgg_ids.csv'
    categories_df = pd.read_csv(path_3, index_col=0)

    # replace old ids in bga_category_game_relations:
    bga_categories_game_relation = pd.merge(left=bga_categories_game_relation,
                                            right=categories_df,
                                            left_on='category_id',
                                            right_on='category_bga_id')
    bga_categories_game_relation = bga_categories_game_relation[[
        'game_key', 'category_key'
    ]]

    # replace old ids in bgg_category_game_relations:
    bgg_categories_game_relation = pd.merge(left=bgg_categories_game_relation,
                                            right=categories_df,
                                            left_on='bgg_category_key',
                                            right_on='bgg_category_key')
    bgg_categories_game_relation = bgg_categories_game_relation[[
        'game_key', 'category_key'
    ]]

    # delete old bga and bgg id & name columns in categories_df
    categories_df = categories_df[[
        'category_key', 'category_name', 'category_bga_url'
    ]].reset_index(drop=True)

    #
    # CONCATENATE both tables:
    #

    concat_categories_game_relation = pd.concat(
        [bga_categories_game_relation, bgg_categories_game_relation],
        ignore_index=True,
        sort=False).sort_values(['game_key']).reset_index(drop=True)

    # remove duplicates:
    categories_df.drop_duplicates(inplace=True)
    concat_categories_game_relation.drop_duplicates(inplace=True)

    # export categories_game_relation
    export_path_1 = '../Data/Joined/Results/Category_Game_Relation.csv'
    export_df_to_csv(concat_categories_game_relation, export_path_1)

    # export categories_df
    export_path_2 = '../Data/Joined/Results/Categories.csv'
    export_df_to_csv(categories_df, export_path_2)
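
The id-to-key substitution used in this and the surrounding examples is simply an inner merge followed by a projection onto the new key columns. A toy illustration with hypothetical data:

import pandas as pd

relation = pd.DataFrame({'game_key': [1, 2], 'category_id': ['abc', 'xyz']})
keys = pd.DataFrame({'category_bga_id': ['abc', 'xyz'],
                     'category_key': [10, 11]})

# the inner merge attaches the surrogate key, the projection drops the old id
relation = pd.merge(left=relation, right=keys,
                    left_on='category_id', right_on='category_bga_id')
relation = relation[['game_key', 'category_key']]
print(relation)  # game_key/category_key pairs only

Note that the inner merge silently drops relation rows whose old id has no entry in the key table, and can duplicate rows when the key table is not unique, which is why duplicates are removed afterwards in the listing above.
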
Code Example #6
def create_list_of_all_bga_designers():
    # import bga designers
    fuzzy_import_path = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation*.json'
    import_path = get_latest_version_of_file(fuzzy_import_path)
    bga_designers_game_relation = import_json_to_dataframe(import_path)

    # extract designers ids and designer urls
    designers = bga_designers_game_relation[['designer_id', 'designer_url']]

    # keep only unique designers (reassign instead of inplace to avoid
    # modifying a slice of the original dataframe):
    designers = designers.drop_duplicates(subset='designer_id', keep='first')

    # export designers to csv:
    export_path = '../Data/BoardGameAtlas/Processed/API/BGA_All_Unique_designers.csv'
    export_df_to_csv(designers, export_path)
Code Example #7
def normalize_bgg_game_categories_relation():
    # import bgg games:
    categories_bgg_games_relation_path_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/05_BGG_Game_category_Relation_*.csv'
    categories_bgg_games_relation_path = get_latest_version_of_file(
        categories_bgg_games_relation_path_fuzzy)
    categories_bgg_games_relation_df = pd.read_csv(
        categories_bgg_games_relation_path, index_col=0)

    # create categories list:
    categories_bgg = pd.DataFrame(
        categories_bgg_games_relation_df['category_name'].drop_duplicates())

    # create temporary key_column:
    categories_bgg.insert(0, 'bgg_category_key',
                          range(1001, 1001 + len(categories_bgg)))

    # import game keys:
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # join games_category_relation table to replace bga_game_id column with game_keys column:
    categories_bgg_games_relation_df = pd.merge(
        left=categories_bgg_games_relation_df,
        right=game_keys,
        left_on='bgg_game_id',
        right_on='bgg_game_id')

    # replace 'category_name' with 'bgg_category_key' in categories_bgg_games_relation:
    categories_bgg_games_relation_df = pd.merge(
        left=categories_bgg_games_relation_df,
        right=categories_bgg,
        left_on='category_name',
        right_on='category_name')

    # normalize by only keeping game_key and category_id
    categories_bgg_games_relation_df = categories_bgg_games_relation_df[[
        'game_key', 'bgg_category_key'
    ]]

    # export bgg categories:
    export_path = '../Data/BoardGameGeeks/Raw/BGG_categories.csv'
    export_df_to_csv(categories_bgg, export_path)

    # export bgg game_categories_relation:
    export_path = '../Data/BoardGameGeeks/Processed/GameInformation/05_BGA_Game_Categories_Relation_Cleaned.csv'
    export_df_to_csv(categories_bgg_games_relation_df, export_path)
Code Example #8
def integrate_game_name_relation_tables():
    # Import BGA game_name_relation table:
    fuzzy_import_path_1 = '../Data/BoardGameAtlas/Processed/API/06_BGA_Game_Names_Relation_*.json'
    import_path_1 = get_latest_version_of_file(fuzzy_import_path_1)
    names_bga = import_json_to_dataframe(import_path_1)

    # Import BGG game_name_relation table:
    fuzzy_import_path_2 = '../Data/BoardGameGeeks/Processed/GameInformation/06_BGG_Game_Name_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(fuzzy_import_path_2)
    names_bgg = pd.read_csv(import_path_2, index_col=0)

    # Import Game keys:
    import_path_3 = '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv'
    game_keys = pd.read_csv(import_path_3, index_col=0)

    # Replace bga 'game_id' with 'game_key'
    names_bga = pd.merge(left=names_bga,
                         right=game_keys,
                         left_on='game_id',
                         right_on='bga_game_id')
    names_bga = names_bga[['game_key', 'game_name']]

    # Replace bgg 'game_id' with 'game_key'
    names_bgg = pd.merge(left=names_bgg,
                         right=game_keys,
                         left_on='bgg_game_id',
                         right_on='bgg_game_id')
    names_bgg = names_bgg[['game_key', 'game_name']]

    # Merge both dataframes:
    names_combined = pd.concat([names_bga, names_bgg]).sort_values('game_key')

    # Remove duplicates:
    print(
        'Number of duplicate game names in GameNameTranslation table found and dropped: '
        + str(len(names_combined) - len(names_combined.drop_duplicates())))
    names_combined.drop_duplicates(inplace=True)

    # Export result:
    export_path = '../Data/Joined/Results/GameNameTranslation.csv'
    export_df_to_csv(names_combined, export_path)
Code Example #9
def clean_bga_api_review_data():

    filename = '../Data/BoardGameAtlas/Processed/API/bga_all_reviews_for_games_with_more_than_2_reviews.json'
    # first check if file already exists:
    # if file doesn't exist call function to create it:
    if not os.path.isfile(filename):
        gather_bga_api_review_data()

    # import data
    df = import_json_to_dataframe(
        '../Data/BoardGameAtlas/Processed/API/bga_all_reviews_for_games_with_more_than_2_reviews.json'
    )

    # remove reviews with a rating < 1: these ratings are actually errors since the rating scale goes from 1 - 5
    # and not 0 - 5 when rating games on boardgameatlas.com.
    df = df[df['rating'] >= 1]

    # adjust ratings so that they correspond to a scale from 1-10 instead of 1-5:
    # a BGA rating of 1 should result in a 1 on the new scale:  1 * 2.25 - 1.25 =  2.25 - 1.25 =  1
    # a BGA rating of 5 should result in a 10 on the new scale: 5 * 2.25 - 1.25 = 11.25 - 1.25 = 10
    df['rating'] = 2.25 * df['rating'] - 1.25

    # drop column review_title
    del df['review_title']

    # add column has_review_text (0 = only rating, no text; 1 = rating + text)
    df['has_review_text'] = np.where(df['review_text'].isnull(), 0, 1)

    # add column that states the origin of the comment (datasource)
    df['review_origin'] = 'bga'

    # rename columns:
    df.rename(columns={
        'username': 'user_name',
        'date': 'review_date'
    },
              inplace=True)

    export_df_to_csv(
        df,
        '../Data/BoardGameAtlas/Processed/API/bga_all_reviews_for_games_with_more_than_2_reviews_CLEANED.csv'
    )
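
The constants 2.25 and -1.25 in clean_bga_api_review_data fall out of the general linear rescale from the BGA scale [1, 5] to the target scale [1, 10]; a quick check:

# linear map [1, 5] -> [1, 10]:
#   f(r) = (r - 1) * (10 - 1) / (5 - 1) + 1 = 2.25 * r - 1.25
for r in (1, 3, 5):
    print(r, '->', 2.25 * r - 1.25)   # 1 -> 1.0, 3 -> 5.5, 5 -> 10.0
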
Code Example #10
def merge_scraped_bga_designer_data_and_api_data():
    # import scraped bga designer data
    import_path_1 = '../Data/BoardGameAtlas/Raw/Scrapy/designers/bga_designers.json'
    designers_scrapy = import_json_to_dataframe(import_path_1)

    # import api bga designer data
    import_path_2_fuzzy = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation_*.json'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    designers_game_relation = import_json_to_dataframe(import_path_2)

    # remove list around designer image url:
    designers_scrapy = designers_scrapy.explode('designer_bga_image_url')

    # merge both dataframes:
    designers_merged = pd.merge(left=designers_scrapy,
                                right=designers_game_relation,
                                left_on='designer_url',
                                right_on='designer_url')

    # export df
    export_path = '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv'
    export_df_to_csv(designers_merged, export_path)
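
The explode call above un-nests the one-element lists that the scraper stored in designer_bga_image_url; a toy illustration with hypothetical data:

import pandas as pd

df = pd.DataFrame({'designer_url': ['/designer/1', '/designer/2'],
                   'designer_bga_image_url': [['img1.jpg'], ['img2.jpg']]})
# explode lifts each list element onto its own row, turning the
# one-element lists into plain scalar values
print(df.explode('designer_bga_image_url'))
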
Code Example #11
def merge_game_information():
    '''
    Function merges the boardgames of the previously matched games.

    For the matched games there are four types of columns:
        a) columns that exist in both datasets but we only need to keep one of them (can include conflicting values):
            (e.g. name, year_published, min_players, ...)
            In this case we chose to keep the bgg columns! (a "trust-your-friends" resolution strategy: in case of
            contradicting values we decide which value to keep based on the data source it comes from)
        b) columns that exist in both datasets but we want to keep both:
            (e.g. bga_game_id/bgg_game_id, num_user_ratings, average_user_rating, bga_rank/bgg_rank, ...)
        c) columns that exist only in the bgg dataset:
            (e.g. num_user_comments, bgg_average_weight, ...)
        d) columns that exist only in the bga dataset:
            (e.g. reddit_all_time_count, bga_game_url, ...)
    '''
    # import data:
    # bgg game information dataset:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    # bga game information dataset:
    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)

    # 1) this leaves us with three groups:
    # a) Matched Games
    # b) BGG Games that could not be matched
    # c) BGA Games that could not be matched

    # 1a) matched games:
    ids_matched_games_df = pd.read_csv(
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv',
        index_col=0)
    bgg_subset_matches = bgg_df[bgg_df['bgg_game_id'].isin(
        ids_matched_games_df['bgg_game_id'])]
    bga_subset_matches = bga_df[bga_df['bga_game_id'].isin(
        ids_matched_games_df['bga_game_id'])]
    # 1b) BGG games not matched:
    bgg_subset_no_matches = bgg_df[~bgg_df['bgg_game_id'].
                                   isin(ids_matched_games_df['bgg_game_id'])]
    # 1c) BGA games not matched:
    bga_subset_no_matches = bga_df[~bga_df['bga_game_id'].
                                   isin(ids_matched_games_df['bga_game_id'])]

    # 2)
    # For the matched games there are four types of columns:
    #   a) columns that exist in both datasets but we only need to keep one of them:
    #       (e.g. name, year_published, min_players, ...)
    #       In this case we chose to keep the bgg columns! It doesn't really matter which ones you keep though!
    #   b) columns that exist in both datasets but we want to keep both:
    #       (e.g. bga_game_id/bgg_game_id, num_user_ratings, average_user_rating, bga_rank/bgg_rank, ...)
    #   c) columns that exist only in the bgg dataset:
    #       (e.g. num_user_comments, bgg_average_weight, ...)
    #   d) columns that exist only in the bga dataset:
    #       (e.g. reddit_all_time_count, bga_game_url, ...)

    # 2a) drop columns from bga dataset:
    drop_bga_columns = [
        'name', 'year_published', 'min_players', 'max_players', 'min_playtime',
        'max_playtime', 'min_age', 'game_description', 'image_url',
        'thumbnail_url'
    ]
    bga_subset_matches.drop(columns=drop_bga_columns, inplace=True)

    # add 'matched_bgg_id' column:
    bga_subset_matches = pd.merge(left=bga_subset_matches,
                                  right=ids_matched_games_df,
                                  left_on='bga_game_id',
                                  right_on='bga_game_id')

    # merge both datasets:
    matched_games_df = pd.merge(left=bgg_subset_matches,
                                right=bga_subset_matches,
                                left_on=['bgg_game_id'],
                                right_on=['bgg_game_id'])

    # Handle duplicate ids in matched_games_df:
    # remove duplicates:
    # duplicate bgg_ids:
    matched_games_df.drop_duplicates(subset=['bgg_game_id'],
                                     keep='first',
                                     inplace=True)
    # duplicate bga_ids:
    matched_games_df.drop_duplicates(subset=['bga_game_id'],
                                     keep='first',
                                     inplace=True)

    # In a last (union) step we now have to concatenate all three dataframes into one big dataframe:
    games_df = pd.concat(
        [matched_games_df, bgg_subset_no_matches, bga_subset_no_matches],
        ignore_index=True,
        sort=False)

    # reorder columns:
    cols_to_order = [
        'name', 'bgg_game_id', 'bga_game_id', 'year_published', 'min_players',
        'max_players', 'min_playtime', 'max_playtime', 'min_age',
        'bgg_average_user_rating', 'bga_average_user_rating',
        'bgg_num_user_ratings', 'bga_num_user_ratings'
    ]
    new_columns = cols_to_order + (
        games_df.columns.drop(cols_to_order).tolist())
    games_df = games_df[new_columns]

    # create new unique key_column:
    games_df.insert(0, 'game_key', range(100001, 100001 + len(games_df)))

    # create key_csv that contains bga_game_id, bgg_game_id and game_key:
    key_df = games_df[['game_key', 'bga_game_id', 'bgg_game_id']]

    # check if there are any duplicates in game_df:
    games_df_duplicates = len(games_df) - len(games_df.drop_duplicates())

    if games_df_duplicates > 0:
        print('Warning. ' + str(games_df_duplicates) +
              ' duplicates found in BoardGameTable: ')
        games_df.drop_duplicates(inplace=True)
        print('Duplicates removed!')

    # check if there are any duplicates in key_df:
    count_duplicates_bgg = len(key_df[~key_df['bgg_game_id'].isnull()]) - len(
        key_df[~key_df['bgg_game_id'].isnull()].drop_duplicates(
            subset='bgg_game_id'))
    count_duplicates_bga = len(key_df[~key_df['bga_game_id'].isnull()]) - len(
        key_df[~key_df['bga_game_id'].isnull()].drop_duplicates(
            subset='bga_game_id'))

    if (count_duplicates_bgg + count_duplicates_bga) > 0:
        print('Warning. Duplicates found: ')
        print('BGG_game_ids: ' + str(count_duplicates_bgg))
        print('BGA_game_ids: ' + str(count_duplicates_bga))
        key_df.drop_duplicates(inplace=True)
        print('Duplicates removed:')

    # Fix badly encoded symbols
    # Replace HTML quote entities with apostrophes
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&quot;', '\'')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&rdquo;', '\'')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&rsquo;', '\'')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&ldquo;', '\'')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&amp;', '&')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&eacute;', 'e')

    # Insert Umlaute
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&auml;', 'ä')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&Uuml;', 'ü')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&uuml;', 'ü')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&ouml;', 'ö')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&szlig;', 'ß')

    # Insert dashes & non-breaking space
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&ndash;', '-')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&mdash;', '-')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&nbsp;', ' ')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&times;', 'x')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&shy;', '-')

    # Strip remaining numeric HTML entities (&#...;) and other leftover entity fragments
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&#...;', '')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&#..;', ' ')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'&#.;', '')

    games_df['game_description'] = games_df['game_description'].str.replace(
        r'.....;', '')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'....;', '')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'...;', '')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'..;', '')
    games_df['game_description'] = games_df['game_description'].str.replace(
        r'.;', '')

    # Remove double semicolon and double spaces
    games_df['game_description'] = games_df['game_description'].str.replace(
        r';;', ' ')
    games_df['game_description'] = games_df['game_description'].str.replace(
        ' +', ' ')
    games_df['game_description'] = games_df['game_description'].str.strip()

    # export to csv:
    export_df_to_csv(games_df, '../Data/Joined/Results/BoardGames.csv')
    export_df_to_csv(
        key_df,
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv')
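
The long chain of str.replace calls in merge_game_information handles a fixed set of HTML entities one by one. A hypothetical alternative, not used in the listing, is the standard library's html.unescape, which decodes all named and numeric entities in one pass:

import html
import pandas as pd

# sketch of an alternative to the replace chain above
s = pd.Series(['Dice &amp; Dragons: gr&ouml;&szlig;te Caf&eacute;-Edition'])
print(s.map(html.unescape).tolist())
# ['Dice & Dragons: größte Café-Edition']

Note that unescape yields the real characters (e.g. 'ö', 'é') rather than the ASCII approximations chosen in the listing, so the two approaches are not drop-in equivalents.
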
Code Example #12
def merge_bga_and_bgg_publisher_game_relation():
    # import bga_publisher_game_relation
    import_path_1 = '../Data/BoardGameAtlas/Processed/API/bga_publishers_scrapy_and_api_data_merged.csv'
    game_publisher_relation_bga = pd.read_csv(import_path_1, index_col=0)
    game_publisher_relation_bga = game_publisher_relation_bga[[
        'game_id', 'publisher_id'
    ]]

    # import bgg_publisher_game_relation
    import_path_2_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/02_BGG_Game_Publisher_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    game_publisher_relation_bgg = pd.read_csv(import_path_2, index_col=0)
    game_publisher_relation_bgg = game_publisher_relation_bgg[[
        'bgg_game_id', 'publisher_name'
    ]]

    # import publishers
    import_path_3 = '../Data/Joined/Results/publisher.csv'
    publishers = pd.read_csv(import_path_3, index_col=0)

    # import game keys
    import_path_4 = '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv'
    game_keys = pd.read_csv(import_path_4, index_col=0)

    # replace bga game ids with game keys
    game_publisher_relation_bga = pd.merge(left=game_publisher_relation_bga,
                                           right=game_keys,
                                           left_on='game_id',
                                           right_on='bga_game_id')
    game_publisher_relation_bga = game_publisher_relation_bga[[
        'game_key', 'publisher_id'
    ]]

    # replace publisher_bga_id with publisher_key
    game_publisher_relation_bga = pd.merge(left=publishers,
                                           right=game_publisher_relation_bga,
                                           left_on='publisher_bga_id',
                                           right_on='publisher_id')
    game_publisher_relation_bga = game_publisher_relation_bga[[
        'game_key', 'publisher_key'
    ]]

    # replace bgg game ids with game keys
    game_publisher_relation_bgg = pd.merge(left=game_publisher_relation_bgg,
                                           right=game_keys,
                                           left_on='bgg_game_id',
                                           right_on='bgg_game_id')
    game_publisher_relation_bgg = game_publisher_relation_bgg[[
        'game_key', 'publisher_name'
    ]]

    # replace bgg publisher name with publisher key
    game_publisher_relation_bgg = pd.merge(left=game_publisher_relation_bgg,
                                           right=publishers,
                                           left_on='publisher_name',
                                           right_on='bgg_publisher_name')
    game_publisher_relation_bgg = game_publisher_relation_bgg[[
        'game_key', 'publisher_key'
    ]]

    # concat both dataframes:
    game_publisher_relation_combined = pd.concat(
        [game_publisher_relation_bga, game_publisher_relation_bgg])

    # remove duplicates:
    game_publisher_relation_combined.drop_duplicates(inplace=True)

    # export game_publisher relation:
    export_path = '../Data/Joined/Results/Publisher_Game_Relation.csv'
    export_df_to_csv(game_publisher_relation_combined, export_path)
Code Example #13
def merge_bga_and_bgg_designers():
    # import bga data:
    import_path_1 = '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv'
    bga_designers_game_relation = pd.read_csv(import_path_1, index_col=0)

    # import bgg data:
    import_path_2_fuzzy = '../Data/BoardGameGeeks/Processed/GameInformation/03_BGG_Game_designer_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    bgg_designers_game_relation = pd.read_csv(import_path_2, index_col=0)

    # drop NA's from bga_designers_game_relation names
    bga_designers_na = bga_designers_game_relation[
        bga_designers_game_relation['designer_name'].isnull()]
    if len(bga_designers_na) > 0:
        print(
            str(len(bga_designers_na)) +
            ' rows dropped from bga_game_relation table because designer_names are missing.'
        )
        bga_designers_game_relation = bga_designers_game_relation[
            ~bga_designers_game_relation['designer_name'].isnull()]

    # create designers df:
    bga_designers_df = bga_designers_game_relation[[
        'designer_name', 'designer_bga_image_url', 'designer_url',
        'designer_id'
    ]].drop_duplicates()
    bga_designers_df.rename(columns={
        'designer_bga_image_url': 'designer_image_url',
        'designer_id': 'designer_bga_id'
    },
                            inplace=True)

    bgg_designers_df = pd.DataFrame(
        bgg_designers_game_relation[['designer_name']].drop_duplicates())
    bgg_designers_df.rename(columns={'designer_name': 'bgg_designer_name'},
                            inplace=True)

    # add bgg_designer_key:
    bgg_designers_df.insert(0, 'designer_bgg_key',
                            range(1, 1 + len(bgg_designers_df)))

    # extract designer names:
    bga_designer_names_list = bga_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()
    bgg_designer_names_list = bgg_designers_game_relation[
        'designer_name'].drop_duplicates().to_list()

    # match designer names:
    # get exact name matches:
    exact_matches = list(
        set(bga_designer_names_list).intersection(bgg_designer_names_list))

    # subsets for data that could not get matched exactly:
    bga_names_not_matched_list = [
        name for name in bga_designer_names_list if name not in exact_matches
    ]
    bgg_names_not_matched_list = [
        name for name in bgg_designer_names_list if name not in exact_matches
    ]

    # jaccard matching for names that could not be matched exactly
    matches = []
    for bga_designer in bga_names_not_matched_list:
        match = find_match(bga_designer, bgg_names_not_matched_list,
                           JACCARD_THRESHOLD_DESIGNERS)
        matches.append({
            'bga_name': bga_designer,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })

    # create list of matched designer names:
    jaccard_matches_bga = [
        designer['bga_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]
    jaccard_matches_bgg = [
        designer['bgg_name'] for designer in matches
        if designer['jaccard_score'] != ''
    ]

    # create list of all matched designer names:
    all_matches_bga = exact_matches + jaccard_matches_bga
    all_matches_bgg = exact_matches + jaccard_matches_bgg

    # create dataframe of matched designers:
    jaccard_matches_df = pd.DataFrame(matches)
    jaccard_matches_df = jaccard_matches_df[
        jaccard_matches_df['jaccard_score'] != ''].sort_values('jaccard_score',
                                                               ascending=False)
    del jaccard_matches_df['jaccard_score']

    # 1) Create DF of all designers that could be matched
    #       a) exact matches
    #       b) jaccard matches
    # 2) Create DF of bga designers that could not be matched
    # 3) Create DF of bgg designers that could not be matched
    # 4) Concat all DFs to one designers df

    # Structure: designer_key | designer_name | designer_bga_id | designer_bgg_id | designer_url | designer_image_url
    # 1) a)
    bga_exact_matches = bga_designers_df[
        bga_designers_df['designer_name'].isin(exact_matches)]
    bgg_exact_matches = bgg_designers_df[
        bgg_designers_df['bgg_designer_name'].isin(exact_matches)]

    joined_exact_matches = pd.merge(left=bga_exact_matches,
                                    right=bgg_exact_matches,
                                    left_on='designer_name',
                                    right_on='bgg_designer_name')

    # 1) b)
    bga_jaccard_matches = pd.merge(left=bga_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='designer_name',
                                   right_on='bga_name')
    bgg_jaccard_matches = pd.merge(left=bgg_designers_df,
                                   right=jaccard_matches_df,
                                   left_on='bgg_designer_name',
                                   right_on='bgg_name')

    joined_jaccard_matches = pd.merge(left=bga_jaccard_matches,
                                      right=bgg_jaccard_matches,
                                      left_on='designer_name',
                                      right_on='bga_name')
    # drop columns not needed
    joined_jaccard_matches = joined_jaccard_matches[[
        'designer_name', 'designer_bga_id', 'designer_bgg_key',
        'bgg_designer_name', 'designer_url', 'designer_image_url'
    ]]

    # 2)
    bga_no_matches = bga_designers_df[~bga_designers_df['designer_name'].
                                      isin(all_matches_bga)]

    # 3)
    bgg_no_matches = bgg_designers_df[~bgg_designers_df['bgg_designer_name'].
                                      isin(all_matches_bgg)]
    # add designers column:
    bgg_no_matches['designer_name'] = bgg_no_matches['bgg_designer_name']

    # 4) Create large dataframe by concatenating all dataframes:
    # size: 473 [1a] + 7 [1b] + 25 [2] + 5928 [3] = 6433
    designers_df = pd.concat([
        joined_exact_matches, joined_jaccard_matches, bga_no_matches,
        bgg_no_matches
    ])

    # add designer key:
    designers_df.insert(0, 'designer_key', range(1, 1 + len(designers_df)))

    # remove duplicates:
    designers_df.drop_duplicates(inplace=True)

    # export designers
    export_path = '../Data/Joined/Results/Designer.csv'
    export_df_to_csv(designers_df, export_path)
Code Example #14
def clean_reviews():
    # import users:
    users_path = '../Data/Joined/Results/User.csv'
    users_df = pd.read_csv(users_path, index_col=0)
    users_df = users_df[['user_key', 'user_name', 'user_origin']]

    # import reviews:
    reviews_path = '../Data/Joined/Integration/Reviews/Reviews_All_Games_Integrated.pickle'
    all_reviews = import_pickle_to_dataframe(reviews_path)

    # Delete user_id column from review_df which currently holds user_ids from bga_dataset
    # Also delete these columns: review_id, review_date, game_id, bga_game_id and bgg_game_id
    delete_columns = [
        'user_id', 'review_id', 'review_date', 'game_id', 'bga_game_id',
        'bgg_game_id'
    ]
    all_reviews.drop(columns=delete_columns, inplace=True)

    # Match user_name and user_id
    all_reviews = pd.merge(left=all_reviews,
                           right=users_df,
                           left_on=['review_origin', 'user_name'],
                           right_on=['user_origin', 'user_name'])

    # Drop columns user_name and review_origin to normalize:
    delete_columns = ['user_name', 'review_origin']
    all_reviews.drop(columns=delete_columns, inplace=True)

    # Change column order:
    cols_to_order = [
        'game_key', 'user_key', 'rating', 'review_text', 'has_review_text'
    ]
    new_columns = cols_to_order + (
        all_reviews.columns.drop(cols_to_order).tolist())
    all_reviews = all_reviews[new_columns]

    # Drop text columns since we do not use them and they take away a lot of memory:
    del all_reviews['review_text']

    ## Take care of duplicates:
    # check if there are any duplicates
    count_duplicates = len(all_reviews) - len(
        all_reviews.drop_duplicates(subset=['game_key', 'user_key'],
                                    keep='first'))

    if count_duplicates > 0:
        print('Warning. ' + str(count_duplicates) + ' duplicate reviews found.')

        # remove duplicates:
        all_reviews.drop_duplicates(subset=['game_key', 'user_key'],
                                    keep='first',
                                    inplace=True)
        print('Found duplicates removed!')

    ## Keep only reviews of users with a certain amount of ratings and reviews of games with >= ... ratings:
    print('Removing reviews of games with less than ' +
          str(MIN_REVIEWS_PER_GAME) + ' reviews and users with less than ' +
          str(MIN_REVIEWS_PER_USER) + ' reviews.')

    reviews_full_dataset = len(all_reviews)
    games_full_dataset = all_reviews['game_key'].nunique()
    users_full_dataset = all_reviews['user_key'].nunique()

    # keep only reviews of games with >= ... reviews:
    num_reviews_per_game = all_reviews.game_key.value_counts()
    all_reviews = all_reviews[all_reviews.game_key.isin(
        num_reviews_per_game.index[num_reviews_per_game.ge(
            MIN_REVIEWS_PER_GAME)])]

    # keep only reviews of users with >= ... reviews:
    num_reviews_per_user = all_reviews.user_key.value_counts()
    all_reviews = all_reviews[all_reviews.user_key.isin(
        num_reviews_per_user.index[num_reviews_per_user.ge(
            MIN_REVIEWS_PER_USER)])]

    ## Track changes:
    # Count reviews, games and users in reduced dataset:
    reviews_reduced_dataset = len(all_reviews)
    games_reduced_dataset = all_reviews['game_key'].nunique()
    users_reduced_dataset = all_reviews['user_key'].nunique()

    # Calculate absolute number of dropped values:
    reviews_dropped_abs = reviews_full_dataset - reviews_reduced_dataset
    games_dropped_abs = games_full_dataset - games_reduced_dataset
    users_dropped_abs = users_full_dataset - users_reduced_dataset

    # Calculate relative number of dropped values:
    reviews_dropped_rel = reviews_dropped_abs / reviews_full_dataset
    games_dropped_rel = games_dropped_abs / games_full_dataset
    users_dropped_rel = users_dropped_abs / users_full_dataset

    # Create a nice table to visualize the results:
    table = [[
        'Reviews', reviews_full_dataset, reviews_reduced_dataset,
        reviews_dropped_abs, reviews_dropped_rel
    ],
             [
                 'Games', games_full_dataset, games_reduced_dataset,
                 games_dropped_abs, games_dropped_rel
             ],
             [
                 'Users', users_full_dataset, users_reduced_dataset,
                 users_dropped_abs, users_dropped_rel
             ]]

    print(
        tabulate(table,
                 headers=[
                     '_', 'Count (full dataset)', 'Count (reduced dataset)',
                     'Num dropped (absolute)', 'Num dropped (relative)'
                 ]))

    # Calculate the size of the utility matrix, prior and after reducing the dataset:
    size_utility_matrix_full_dataset = games_full_dataset * users_full_dataset
    size_utility_matrix_reduced_dataset = games_reduced_dataset * users_reduced_dataset

    print('Reviews dropped: ' + str(reviews_dropped_rel * 100) + ' %')
    print('Utility matrix full dataset: ' + str(games_full_dataset) + ' * ' +
          str(users_full_dataset))
    print('Utility matrix reduced dataset: ' + str(games_reduced_dataset) +
          ' * ' + str(users_reduced_dataset))

    print('Size of utility matrix reduced by: ' +
          str(100 * (1 - (size_utility_matrix_reduced_dataset /
                          size_utility_matrix_full_dataset))) + ' %')

    # Export df:
    export_df_to_csv(all_reviews, '../Data/Joined/Results/Reviews_Reduced.csv')
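
The two value_counts/isin filters in clean_reviews run once each, games first, then users. A toy run with hypothetical data shows the mechanics, and also why a single pass is not a fixed point in general: dropping sparse users can in principle push a game back below MIN_REVIEWS_PER_GAME (here it happens not to):

import pandas as pd

reviews = pd.DataFrame({'game_key': [1, 1, 2, 2, 2, 3],
                        'user_key': [7, 8, 7, 8, 9, 9]})

# keep only games with at least 2 reviews (drops game 3):
per_game = reviews.game_key.value_counts()
reviews = reviews[reviews.game_key.isin(per_game.index[per_game.ge(2)])]

# then keep only users with at least 2 reviews (drops user 9):
per_user = reviews.user_key.value_counts()
reviews = reviews[reviews.user_key.isin(per_user.index[per_user.ge(2)])]
print(reviews)  # game 2 keeps exactly 2 reviews and stays above the threshold
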
Code Example #15
def clean_bgg_games():
    filename = '../Data/BoardGameGeeks/Raw/games_detailed_info.csv'
    df = pd.read_csv(filename)

    # remove first column (unnamed):
    df.drop(df.columns[0], axis=1, inplace=True)

    # remove further unwanted columns:
    columns_to_remove = [
        'type', 'playingtime', 'boardgameartist', 'boardgamefamily',
        'boardgameexpansion', 'boardgameimplementation', 'median', 'owned',
        'wanting', 'wishing', 'trading', 'numweights', 'suggested_num_players',
        'suggested_playerage', 'suggested_language_dependence',
        'boardgameintegration', 'boardgamecompilation', 'Strategy Game Rank',
        'Family Game Rank', 'Party Game Rank', 'Abstract Game Rank',
        'Thematic Rank', 'War Game Rank', 'Customizable Rank',
        "Children's Game Rank", 'RPG Item Rank', 'Accessory Rank',
        'Video Game Rank', 'Amiga Rank', 'Commodore 64 Rank', 'Arcade Rank',
        'Atari ST Rank'
    ]
    df.drop(columns_to_remove, axis=1, inplace=True)

    # rename columns:
    df.rename(columns={
        'id': 'bgg_game_id',
        'primary': 'name',
        'yearpublished': 'year_published',
        'minplayers': 'min_players',
        'maxplayers': 'max_players',
        'minplaytime': 'min_playtime',
        'maxplaytime': 'max_playtime',
        'minage': 'min_age',
        'usersrated': 'bgg_num_user_ratings',
        'numcomments': 'bgg_num_user_comments',
        'average': 'bgg_average_user_rating',
        'bayesaverage': 'bgg_bayes_average',
        'stddev': 'bgg_stddev',
        'Board Game Rank': 'bgg_rank',
        'description': 'game_description',
        'image': 'image_url',
        'thumbnail': 'thumbnail_url',
        'averageweight': 'bgg_average_weight'
    },
              inplace=True)

    # desired dataframes:
    # 1) main_game_information
    # 2) publishers
    # 3) designers
    # 4) mechanics
    # 5) categories
    # 6) names

    # 2) extract publishers from 'boardgamepublisher' column:
    # create subset dataframe with only id and names/alternate column:
    publishers_df = df[['bgg_game_id', 'boardgamepublisher']]

    # rename columns
    publishers_df.rename(columns={
        'bgg_game_id': 'bgg_game_id',
        'boardgamepublisher': 'publisher_name'
    },
                         inplace=True)

    # add empty column 'publisher_url' and 'publisher_id':
    publishers_df['publisher_url'] = np.nan
    publishers_df['publisher_id'] = np.nan

    # drop nas:
    publishers_df = publishers_df[publishers_df['publisher_name'].notna()]

    # lists are stored as strings and not as lists. Therefore we have to change the class from str to list:
    publishers_df['publisher_name'] = publishers_df['publisher_name'].apply(
        eval)

    # explode transforms each element of the list to a row
    publishers_df = publishers_df.explode('publisher_name')

    # drop column from games dataframe:
    del df['boardgamepublisher']

    # 3) extract designers from 'boardgamedesigner' column (same procedure as for publishers)
    designers_df = df[['bgg_game_id', 'boardgamedesigner']]
    designers_df.rename(columns={
        'id': 'bgg_game_id',
        'boardgamedesigner': 'designer_name'
    },
                        inplace=True)
    designers_df['designer_url'] = np.nan
    designers_df['designer_id'] = np.nan
    designers_df = designers_df[designers_df['designer_name'].notna()]
    designers_df['designer_name'] = designers_df['designer_name'].apply(eval)
    designers_df = designers_df.explode('designer_name')
    del df['boardgamedesigner']

    # 4) extract mechanics from 'boardgamemechanic' column
    mechanics_df = df[['bgg_game_id', 'boardgamemechanic']]
    mechanics_df.rename(columns={
        'id': 'bgg_game_id',
        'boardgamemechanic': 'mechanic_name'
    },
                        inplace=True)
    mechanics_df['mechanic_id'] = np.nan
    mechanics_df['mechanic_url'] = np.nan
    mechanics_df = mechanics_df[mechanics_df['mechanic_name'].notna()]
    mechanics_df['mechanic_name'] = mechanics_df['mechanic_name'].apply(eval)
    mechanics_df = mechanics_df.explode('mechanic_name')
    del df['boardgamemechanic']

    # 5) extract categories from 'boardgamecategory' column
    categories_df = df[['bgg_game_id', 'boardgamecategory']]
    categories_df.rename(columns={
        'id': 'bgg_game_id',
        'boardgamecategory': 'category_name'
    },
                         inplace=True)
    categories_df['category_id'] = np.nan
    categories_df['category_url'] = np.nan
    categories_df = categories_df[categories_df['category_name'].notna()]
    categories_df['category_name'] = categories_df['category_name'].apply(eval)
    categories_df = categories_df.explode('category_name')
    del df['boardgamecategory']

    # 6) extract names from 'alternate' column:
    names_df = df[['bgg_game_id', 'alternate']]
    names_df.rename(columns={
        'id': 'bgg_game_id',
        'alternate': 'game_name'
    },
                    inplace=True)
    names_df = names_df[names_df['game_name'].notna()]
    names_df['game_name'] = names_df['game_name'].apply(eval)
    names_df = names_df.explode('game_name')
    del df['alternate']

    # Export all 6 dataframes:
    path = '../Data/BoardGameGeeks/Processed/GameInformation/'
    export_df_to_csv(
        df, path + '01_BGG_Game_Information_' +
        datetime.now().strftime("%d_%m_%Y-%H_%M") + '.csv')
    export_df_to_csv(
        publishers_df, path + '02_BGG_Game_Publisher_Relation_' +
        datetime.now().strftime("%d_%m_%Y-%H_%M") + '.csv')
    export_df_to_csv(
        designers_df, path + '03_BGG_Game_Designer_Relation_' +
        datetime.now().strftime("%d_%m_%Y-%H_%M") + '.csv')
    export_df_to_csv(
        mechanics_df, path + '04_BGG_Game_Mechanic_Relation_' +
        datetime.now().strftime("%d_%m_%Y-%H_%M") + '.csv')
    export_df_to_csv(
        categories_df, path + '05_BGG_Game_Category_Relation_' +
        datetime.now().strftime("%d_%m_%Y-%H_%M") + '.csv')
    export_df_to_csv(
        names_df, path + '06_BGG_Game_Name_Relation_' +
        datetime.now().strftime("%d_%m_%Y-%H_%M") + '.csv')
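
clean_bgg_games parses the stringified list columns with bare eval. A hypothetical, safer variant uses ast.literal_eval, which accepts Python literals such as "['Kosmos', 'Mayfair Games']" without executing arbitrary code:

import ast
import pandas as pd

col = pd.Series(["['Kosmos', 'Mayfair Games']", "['Days of Wonder']"])
# literal_eval turns the stored strings back into real lists,
# which explode can then unfold into one row per element
print(col.apply(ast.literal_eval).explode().tolist())
# ['Kosmos', 'Mayfair Games', 'Days of Wonder']
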
Code Example #16
def match_and_merge_bga_and_bgg_categories():
    # import bga and bgg categories
    bga_categories = pd.read_csv(
        '../Data/BoardGameAtlas/Raw/API/categories/all_bga_categories.csv',
        index_col=0)
    bgg_categories = pd.read_csv(
        '../Data/BoardGameGeeks/Raw/BGG_categories.csv', index_col=0)

    bga_categories_names = bga_categories['category_name'].tolist()
    bgg_categories_names = bgg_categories['category_name'].tolist()

    category_jaccard_threshold = 0.4

    # match categories:
    match_list = []
    for bga_category in bga_categories_names:
        match = find_match(bga_category, bgg_categories_names,
                           category_jaccard_threshold)
        match_list.append({
            'bga_name': bga_category,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })

    matches_df = pd.DataFrame(match_list)

    # drop entries that could not be matched
    matches_df = matches_df[matches_df['jaccard_score'] != ''].sort_values(
        'jaccard_score', ascending=False)

    bga_names_matched = matches_df['bga_name'].tolist()
    bgg_names_matched = matches_df['bgg_name'].tolist()

    # build subsets depending on if category name was matched or not
    bga_subset_matches = bga_categories[bga_categories['category_name'].isin(
        bga_names_matched)]
    bgg_subset_matches = bgg_categories[bgg_categories['category_name'].isin(
        bgg_names_matched)]

    bga_subset_no_matches = bga_categories[~bga_categories['category_name'].
                                           isin(bga_names_matched)]
    bgg_subset_no_matches = bgg_categories[~bgg_categories['category_name'].
                                           isin(bgg_names_matched)]

    # rename category_name column:
    bga_subset_matches.rename(columns={'category_name': 'bga_name'},
                              inplace=True)
    bgg_subset_matches.rename(columns={'category_name': 'bgg_name'},
                              inplace=True)
    bga_subset_no_matches.rename(columns={'category_name': 'bga_name'},
                                 inplace=True)
    bgg_subset_no_matches.rename(columns={'category_name': 'bgg_name'},
                                 inplace=True)

    # join matches:
    # start with bga subset
    subset_matches = pd.merge(left=bga_subset_matches,
                              right=matches_df,
                              left_on='bga_name',
                              right_on='bga_name')
    # and then also merge with the bgg subset
    subset_matches = pd.merge(left=subset_matches,
                              right=bgg_subset_matches,
                              left_on='bgg_name',
                              right_on='bgg_name')
    # keep only relevant columns:
    subset_matches = subset_matches[[
        'category_bga_id', 'bgg_category_key', 'bga_name', 'bgg_name',
        'category_bga_url'
    ]]

    # concat all:
    all_categories = pd.concat(
        [subset_matches, bga_subset_no_matches, bgg_subset_no_matches],
        ignore_index=True,
        sort=False).sort_values(['bga_name'])

    # create category key:
    all_categories.insert(0, 'category_key', range(1, 1 + len(all_categories)))

    # add category_name column:
    # out of the matched categories we keep the bgg names:
    all_categories['category_name'] = all_categories['bgg_name']
    # if it is a bga category that could not be matched, take bga name instead:
    all_categories.loc[all_categories['category_name'].isna(),
                       'category_name'] = all_categories['bga_name']

    # export categories df:
    export_path = '../Data/Joined/Integration/GameInformation/05_Categories_Integrated_with_bga_and_bgg_ids.csv'
    export_df_to_csv(all_categories, export_path)
Code Example #17
def match_online_game_names_and_bgg_names():
    onlinegames_filename = get_latest_version_of_file(
        '../Data/Onlinegames/Raw/Onlineboardgames_table_raw.csv')

    onlinegames_df = pd.read_csv(onlinegames_filename, sep=';')

    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    bgg_names = bgg_df['name'].tolist()

    # Extract only games without BGG ID to match
    onlinegames_games_without_BGGID = onlinegames_df[
        onlinegames_df['BGGID'].isna()]
    onlinegame_names_without_BGGID = onlinegames_games_without_BGGID[
        'Name'].tolist()

    # Find exact matches Onlinegames - BGG
    exact_matches = list(
        set(bgg_names).intersection(set(onlinegame_names_without_BGGID)))

    # Exact matches as list of dicts (can later be used to create a pd.DF)
    exact_matches_list_of_dict = [{
        'online_name': x,
        'bgg_name': x
    } for x in exact_matches]

    # subtract exact matches from datasets:
    subset_bgg_df = bgg_df[~bgg_df['name'].isin(exact_matches)]
    subset_onlinegames_df = onlinegames_games_without_BGGID[
        ~onlinegames_games_without_BGGID['Name'].isin(exact_matches)]
    subset_onlinegame_names_without_BGGID = subset_onlinegames_df[
        'Name'].tolist()
    subset_bgg_df_names = subset_bgg_df['name'].tolist()

    # Match left over names Onlinegames - BGG
    match_list = []

    for name in subset_onlinegame_names_without_BGGID:
        match = find_match(name, subset_bgg_df_names,
                           JACCARD_THRESHOLD_GAME_NAME)
        match_list.append({
            'online_name': name,
            'bgg_name': match['name'],
            'jaccard_score': match['jaccard_score']
        })

    # drop entries that could not be matched:
    match_list = [x for x in match_list if x['jaccard_score'] != '']

    # add exact matches to match_list:
    match_list = match_list + exact_matches_list_of_dict

    matches_df = pd.DataFrame(match_list)

    # merge matches and bgg to get bgg ids:
    merge_1 = pd.merge(left=matches_df,
                       right=bgg_df,
                       left_on='bgg_name',
                       right_on='name')
    matches_df = merge_1[['bgg_name', 'online_name', 'bgg_game_id']]

    # merge matches and online games df:
    merge_2 = pd.merge(left=onlinegames_games_without_BGGID,
                       right=matches_df,
                       left_on='Name',
                       right_on='online_name')
    merge_2['BGGID'] = merge_2['bgg_game_id']

    # keep only columns from original online games df:
    merge_2 = merge_2[[
        'Onlinegamelink ID', 'Name', 'Onlinegamelink', 'Origin', 'BGGID'
    ]]

    # create a temp_df that contains all games out of the online games df that were not matched in the process
    # (the ones that had been matched previously and the ones that could not be matched in the process)
    temp_df = onlinegames_df[~onlinegames_df['Onlinegamelink ID'].
                             isin(merge_2['Onlinegamelink ID'].tolist())]

    # combine both to get the full dataset with the additional information about the bgg_game_ids out of the games that
    # were successfully matched:
    onlinegames_df = pd.concat([temp_df, merge_2])
    onlinegames_df.drop_duplicates(subset=['Onlinegamelink ID'], inplace=True)

    ## export online games:
    # rename a few columns
    onlinegames_df.rename(columns={
        'Name': 'name',
        'Onlinegamelink ID': 'online_game_id',
        'Onlinegamelink': 'url',
        'Origin': 'origin',
        'BGGID': 'bgg_id'
    },
                          inplace=True)
    onlinegames_df = onlinegames_df.drop(columns={'Unnamed: 0'})

    # If bgg_id has to be int (Beware of NaN conversion!)
    #onlinegames_df['bgg_id'] = onlinegames_df['bgg_id'].fillna(0.0).astype(int)
    #onlinegames_df['bgg_id'] = onlinegames_df['bgg_id'].astype(int)

    # drop online games without bgg_id:
    onlinegames_df = onlinegames_df[~onlinegames_df['bgg_id'].isna()]

    # export result to csv:
    export_path = '../Data/Onlinegames/Processed/online_games.csv'
    export_df_to_csv(onlinegames_df, export_path)
Code Example #18
def match_game_names():
    """
    This function matches bga and bgg boardgames based on their game names and the year in which they were published.
    This is how it works:
    - We calculate n-grams with n=3 for each board game name.
    - By removing stopwords that appear in many game titles but add little meaning, we can
    reduce the number of false positives and false negatives.
    Examples: the stopwords 'board' and 'game' are removed:
        bga_name = '7 Wonders'
        bgg_name = '7 Wonders - The Board Game'
        -> this would result in a rather low jaccard score without removing the stopwords.

        bga_name = 'Settlers - The Board Game'
        bgg_name = '7 Wonder - The Board Game'
        -> this would result in a rather high jaccard score considering that both do not refer to the same game.
    - We then compare the similarity of a bga candidate and a bgg candidate by calculating the jaccard similarity.
    - The candidate with the highest jaccard score is chosen. Only if the jaccard score of that candidate exceeds our
    threshold the games are matched.

    Scalability Challenge:
    - However, there is one issue with that strategy: computing the jaccard similarity requires pairwise
    comparisons between ca. 8,000 bga games and ca. 19,000 bgg games [O(n*m)]. Comparing all bga_games and
    all bgg_games would lead to an extremely long run time, which we want to avoid.
    -> 8,000 x 19,000 = 152,000,000 comparisons.

    Therefore we adjusted our approach:
    1) First, we find games that can be matched exactly. By this we mean games that have exactly the same name in both
     datasets. Since there are some games with duplicate game names that do not refer to the same game, we also include
     the year of publication. Therefore only games with exactly the same name and exactly the same year of publication
     are matched in this step. We can then subtract these games from their datasets, reducing the number of
    games that still have to be compared to ca. 3,000 bga games and ca. 15,000 bgg games.
    -> 3,000 x 15,000 = 45,000,000 (complexity reduced by ~70%)
    2) This is still quite a lot of comparisons. However, we made another observation: we also tried matching games
    by their game_name alone (without taking year_published into consideration), and for the games that could
    be matched exactly, the publish years agree in almost all cases, which is what one would expect.
    3) Therefore we can further reduce complexity by grouping by publish years and comparing only games that have
    the same publish year. To make sure we don't lose games because the publish years deviate by one year, we also
    compare to games published in the years one year before and after.
    This further reduces the number of comparisons to: ~ 1,000,000
    Hence, by applying the similarity function only to the most promising pairs we reduced the number of required
    comparisons by 98%.
    """

    # Import bgg and bga data:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    bgg_names = bgg_df['name'].tolist()

    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)
    bga_names = bga_df['name'].tolist()

    # Create lists with bga and bgg ids:
    bgg_ids = bgg_df['bgg_game_id'].tolist()
    bga_ids = bga_df['bga_game_id'].tolist()

    # Check duplicate names:
    bgg_duplicate_names = set([x for x in bgg_names if bgg_names.count(x) > 1])
    bga_duplicate_names = set([x for x in bga_names if bga_names.count(x) > 1])

    ## find exact matches (game_name, year_published):
    exact_matches_join_df = pd.merge(left=bgg_df,
                                     right=bga_df,
                                     left_on=['name', 'year_published'],
                                     right_on=['name', 'year_published'])

    # create list of ids of exactly matched games:
    exact_matches_bgg_ids = exact_matches_join_df['bgg_game_id'].tolist()
    exact_matches_bga_ids = exact_matches_join_df['bga_game_id'].tolist()

    # subtract exact matches from datasets to reduce their size:
    subset_bgg_df = bgg_df[~bgg_df['bgg_game_id'].isin(exact_matches_bgg_ids)]
    subset_bga_df = bga_df[~bga_df['bga_game_id'].isin(exact_matches_bga_ids)]
    subset_bgg_df.rename(columns={'year_published': 'year_published_bgg'},
                         inplace=True)
    subset_bga_df.rename(columns={'year_published': 'year_published_bga'},
                         inplace=True)

    ## In the next part we now want to apply name matching. Our first task is to find candidates so that we don't
    ## have to compare all games from one dataset with all games from the other dataset. We do so by grouping by
    ## their year of publication.
    ## First, we need some preprocessing steps so that we can actually set up our candidates:

    # Extract years from bga dataset:
    # A lot of type casting due to unexpected errors with float and set
    all_years = subset_bga_df['year_published_bga'].dropna().tolist()
    all_years = list(map(int, all_years))
    years = list(set(all_years))
    years.sort(reverse=True)

    # Do not apply name matching to games where the publish_year is missing:
    print('Dropped ' + str(subset_bgg_df['year_published_bgg'].isna().sum()) +
          ' rows from bgg_dataset from name_matching')
    print('Dropped ' + str(subset_bga_df['year_published_bga'].isna().sum()) +
          ' rows from bga_dataset from name_matching')
    subset_bgg_df.dropna(inplace=True)
    subset_bga_df.dropna(inplace=True)

    # strip off '.0' at the end of each year by converting to int: 2018.0 -> 2018
    subset_bga_df["year_published_bga"] = subset_bga_df[
        "year_published_bga"].astype(int)

    # create a dictionary to group all bgg games by their year of publication
    # during the name matching process we will only compare the names of games with the same publication year
    bgg_dic_grouped_by_year = {}
    bga_dic_grouped_by_year = {}

    # fill the previously created dictionaries that include all the games that were published in a certain year
    for year in years:
        bgg_dic_grouped_by_year[year] = subset_bgg_df[
            subset_bgg_df['year_published_bgg'] == year].to_dict('records')
        bga_dic_grouped_by_year[year] = subset_bga_df[
            subset_bga_df['year_published_bga'] == year].to_dict('records')

    ## Now we get to the interesting part:
    ## We iterate over all bga_games which we found no exact bgg_matches for. We then create a list with potential
    ## candidates including all bgg_games that were published in the same year or one year before or after.
    ## For these candidates we then apply name_matching using the jaccard similarity.
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            input_string = bga_game['name']

            candidate_list = []
            # create candidate_list with all bgg games that were published in the same year as the bga_game:
            for bgg_game in bgg_dic_grouped_by_year[year]:
                candidate_list.append(bgg_game['name'])

            # also check bgg games that were published in the previous year and one year later:
            if year + 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year + 1]:
                    candidate_list.append(bgg_game['name'])
            if year - 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year - 1]:
                    candidate_list.append(bgg_game['name'])

            # Try to match the input_string (target BGA game name) to one of the games in the candidate_list (bgg games).
            # The match with the highest jaccard similarity is returned. If there is no match, or the jaccard threshold
            # is not reached, an empty string is returned.
            match = find_match(input_string, candidate_list,
                               JACCARD_THRESHOLD_GAME_NAME)
            bga_game['match'] = match['name']
            bga_game['jaccard_score'] = match['jaccard_score']

    global COMPARISONS
    print('Number of comparisons: ' + str(COMPARISONS))

    bga_list_matches = []
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            bga_list_matches.append(bga_game)

    # turn list of dictionaries back to data frame:
    jaccard_matches_df = pd.DataFrame(bga_list_matches)

    # just for debugging and inspecting results:
    analyse_df = pd.DataFrame(bga_list_matches)
    analyse_df = analyse_df[analyse_df['jaccard_score'] != '']
    analyse_df = analyse_df[['name', 'match', 'jaccard_score']]
    analyse_df = analyse_df.sort_values('jaccard_score', ascending=False)

    ## We have now successfully found a large number of games that could be matched. All that's left to do is
    #  to create a dataframe that contains the matched BGA and BGG IDs. We do so in three steps:
    # 1) Prepare DF containing BGA and BGG IDs of games that could be matched exactly by name and year_published
    # 2) Prepare DF containing BGA and BGG IDs of games that could be matched by string matching (jaccard method)
    # 3) Concatenate both data frames

    # 1) Exact matches
    # Keep only ID columns:
    exact_matches_join_df = exact_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 2) Jaccard matches
    # Cut off rows where the jaccard threshold wasn't reached (-> no match)
    jaccard_matches_df = jaccard_matches_df[jaccard_matches_df['match'] != '']
    jaccard_matches_df = jaccard_matches_df[[
        'bga_game_id', 'name', 'year_published_bga', 'match', 'jaccard_score'
    ]]
    jaccard_matches_df.rename(columns={'name': 'bga_name'}, inplace=True)

    # Join both datasets
    jaccard_matches_join_df = pd.merge(
        left=bgg_df[['bgg_game_id', 'name', 'year_published']],
        right=jaccard_matches_df,
        left_on=['name', 'year_published'],
        right_on=['match', 'year_published_bga'])
    jaccard_matches_join_df = jaccard_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 3) Concat both dfs
    matched_game_ids_df = pd.concat(
        [exact_matches_join_df, jaccard_matches_join_df])

    # 4) Store matches to csv:
    export_df_to_csv(
        matched_game_ids_df,
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv'
    )
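
find_match and the n-gram machinery behind it are used throughout these listings but never shown. Below is a minimal sketch consistent with how the callers consume the return value (a dict with 'name' and 'jaccard_score', both empty strings when no candidate clears the threshold); the character-trigram tokenization and the stopword list are assumptions based on the docstring of match_game_names, not the original implementation:

import re

STOPWORDS = {'board', 'game', 'the'}   # assumed, not the original list

def make_ngrams(text, n=3):
    # Sketch: lowercase, drop stopword tokens, then take character trigrams.
    tokens = [t for t in re.findall(r'[a-z0-9]+', text.lower())
              if t not in STOPWORDS]
    cleaned = ' '.join(tokens)
    if not cleaned:
        return set()
    return {cleaned[i:i + n] for i in range(max(len(cleaned) - n + 1, 1))}

def find_match(input_string, candidate_list, threshold):
    # Sketch: return the candidate with the highest jaccard similarity,
    # or empty strings if no candidate reaches the threshold.
    best_name, best_score = '', 0.0
    grams = make_ngrams(input_string)
    for candidate in candidate_list:
        cand_grams = make_ngrams(candidate)
        union = grams | cand_grams
        score = len(grams & cand_grams) / len(union) if union else 0.0
        if score > best_score:
            best_name, best_score = candidate, score
    if best_score < threshold:
        return {'name': '', 'jaccard_score': ''}
    return {'name': best_name, 'jaccard_score': best_score}
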