Esempio n. 1
0
def main():
    """Build the college players CSV described by ``college_players_build.json``.

    The JSON config is read from the directory containing this script. Its
    ``source`` and ``target`` entries are resolved against the directory two
    levels above the script. The input CSV gains ``full_name``,
    ``position_group`` and ``section`` columns, is renamed and reduced to
    ``column_order``, has its ``college`` names normalized, and is written to
    ``output_folder``/``output_file`` without the index.
    """
    school_matching = hlp.return_college_matching_dict()

    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "college_players_build.json")) as f:
        data = json.load(f)

    matching = hlp.return_matching_dict()

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])
    source = os.path.join(source_dir, data['folder'], data['file'])

    df = pd.read_csv(source)
    df['full_name'] = df[['first_name', 'last_name']].astype(str).apply(
        ' '.join, axis=1)
    df['position_group'] = df['position'].map(matching['position_groups'])
    df['section'] = df['position_group'].map(matching['section'])
    df.rename(columns=data['column_rename'], inplace=True)
    # .copy() detaches the column subset so the assignment below does not
    # write into a slice of the original frame.
    df = df[data['column_order']].copy()

    # Two-step normalization: the second lookup falls back to the result of
    # the first one, so a name corrected by school_matching is kept even when
    # matching['college'] has no entry for it.
    college = df['college'].map(school_matching).fillna(df['college'])
    df['college'] = college.map(matching['college']).fillna(college)

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 2
0
def add_espn_id():
    """Merge ESPN ids into the madden build output and rewrite that file.

    Paths come from ``madden_build.json`` next to this script. The CSV at
    ``target``/``output_folder``/``output_file`` is read, merged with the id
    frame from ``hlp.return_id_df`` via ``cm.fuzzy_merge`` on first name,
    last name and position group, reduced to ``id_column_order`` and written
    back to the same path.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "madden_build.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # Input and output are the same file: this step post-processes what the
    # madden build already wrote under the target directory.
    target_folder = os.path.join(two_up, data['target'], data['output_folder'])
    target = os.path.join(target_folder, data['output_file'])

    df = pd.read_csv(target)
    espn_id_df = hlp.return_id_df(
        ['first_name', 'last_name', 'position_group', 'espn_id'])

    merge_keys = ['first_name', 'last_name', 'position_group']

    print("fuzzy merging madden outputs")
    df = cm.fuzzy_merge(df,
                        espn_id_df,
                        merge_keys,
                        merge_keys,
                        threshold=95,
                        limit=1)
    df = df[data['id_column_order']]

    hlp.make_folder_if_not_exists(target_folder)
    df.to_csv(target, index=False)
Esempio n. 3
0
def main():
    """Build the city facts CSV described by ``facts_cities.json``.

    Reads the economic and weather CSVs named in the config, inner-joins
    them on ``fms_city_id``, keeps ``keep_columns``, retains the last row per
    ``fms_city_id`` and writes the result to
    ``output_folder``/``output_file`` without the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "facts_cities.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])

    econ_source = os.path.join(source_dir, data['econ_input']['folder'],
                               data['econ_input']['file'])
    df = pd.read_csv(econ_source)

    weather_source = os.path.join(source_dir, data['weather_input']['folder'],
                                  data['weather_input']['file'])
    weather_df = pd.read_csv(weather_source)
    weather_df = weather_df[data['weather_keep_columns']]

    df = df.merge(weather_df, on=['fms_city_id'], how='inner')

    # Non-inplace drop: avoids mutating a column-sliced frame in place.
    df = df[data['keep_columns']].drop_duplicates(subset='fms_city_id',
                                                  keep='last')

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
def main():
    """Build the players dimension CSV from ``dimensions_players_build.json``.

    Starts from the draft CSV, left-joins the selected college player columns
    on ``espn_id`` and the selected combine columns on ``fms_id``, renames
    columns, keeps the last row per ``fms_id``, left-joins the college id
    frame from ``hlp.return_fms_college_id`` on ``college``, orders the
    columns by ``column_order`` and writes the CSV without the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "dimensions_players_build.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])

    source = os.path.join(source_dir, data['draft']['folder'],
                          data['draft']['file'])
    df = pd.read_csv(source)

    ### Read college players in, including hometown ###

    source = os.path.join(source_dir, data['college_players']['folder'],
                          data['college_players']['file'])
    df_players = pd.read_csv(source)
    df_players = df_players[data['college_players_keep']]
    # Left join: every draft row is kept even without a college player match.
    df = pd.merge(df, df_players, on=['espn_id'], how='left')

    # NOTE(review): the combine stats file is looked up in the *draft* folder
    # (data['draft']['folder']); confirm this is not meant to be
    # data['combine_stats']['folder'].
    source = os.path.join(source_dir, data['draft']['folder'],
                          data['combine_stats']['file'])
    df_combine = pd.read_csv(source)
    df_combine = df_combine[data['combine_stats_keep']]
    df = pd.merge(df, df_combine, on=['fms_id'], how='left')

    df.rename(columns=data['column_rename'], inplace=True)

    df = df.drop_duplicates(subset='fms_id', keep='last')

    df_college_id = hlp.return_fms_college_id()
    df = df.merge(df_college_id, on='college', how='left')

    df = df[data['column_order']]

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 5
0
def main():
    """Combine the yearly madden rating CSVs into one output CSV.

    The files listed under ``file_list`` in ``madden_build.json`` are read in
    order. Each one is renamed, tagged with its year and position group, cut
    down to ``columns`` and given a ``<year>_madden_rating`` column. The
    first file seeds the accumulated frame; each later file is combined with
    it through ``cm.fuzzy_merge`` and concatenation, keeping the last row per
    (first_name, last_name, position_group). The result gets a ``section``
    column, is ordered by ``column_order`` and written without the index.
    """
    print("got to main madden build")
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "madden_build.json")) as f:
        data = json.load(f)

    matching = hlp.return_matching_dict()  # get global matching dictionary

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    df = pd.DataFrame(columns=data['columns'])

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])

    player_keys = ['first_name', 'last_name', 'position_group']

    for position, entry in enumerate(data['file_list']):
        source = os.path.join(source_dir, entry['folder'], entry['file'])
        year = data['year'][entry['file']]

        temp_df = pd.read_csv(source)
        temp_df.rename(columns=data['column_rename'], inplace=True)
        temp_df['year'] = year  # add year
        temp_df['position_group'] = temp_df['position'].map(
            matching['position_groups'])
        # cut all extra columns; .copy() so the new column below is added to
        # an independent frame rather than a slice
        temp_df = temp_df[data['columns']].copy()
        temp_df[str(year) + "_madden_rating"] = temp_df['madden_rating']

        if position == 0:
            # First file has nothing to match against. pd.concat replaces
            # DataFrame.append, which was removed in pandas 2.0.
            df = pd.concat([df, temp_df])
        else:
            df_1 = cm.fuzzy_merge(df,
                                  temp_df,
                                  player_keys,
                                  player_keys,
                                  threshold=95,
                                  limit=1)
            df_2 = pd.concat([temp_df, df_1])
            df = pd.concat([df, df_2])
            df = df.drop_duplicates(subset=player_keys, keep='last')

    df['section'] = df['position_group'].map(matching['section'])

    df.rename(columns=data['column_rename'], inplace=True)
    print(df.columns)
    df = df[data['column_order']]

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 6
0
def main():
    """Build the combine stats CSV described by ``combine_stats_build.json``.

    The input CSV has its ``college`` mapped through the school matching
    dictionary, ``player`` split into first and last name, and position group
    and section derived from ``pos``. It is then left-joined with the id
    frames from ``hlp.return_id_df`` and ``hlp.return_fms_id_df``, ordered by
    ``column_order``, has ``college`` normalized and is written without the
    index.
    """
    school_matching = hlp.return_college_matching_dict()

    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "combine_stats_build.json")) as f:
        data = json.load(f)

    matching = hlp.return_matching_dict()

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])
    source = os.path.join(source_dir, data['folder'], data['file'])
    df = pd.read_csv(source)

    # NOTE(review): there is no fillna here, so colleges missing from
    # school_matching become NaN before the merges below -- confirm intended.
    df['college'] = df['college'].map(school_matching)

    # Split once and reuse; only the first two whitespace-separated tokens of
    # the player name are used.
    name_parts = df['player'].str.split(' ')
    df['first_name'] = name_parts.str[0]
    df['last_name'] = name_parts.str[1]
    df['position_group'] = df['pos'].map(matching['position_groups'])
    df['section'] = df['position_group'].map(matching['section'])
    df.rename(columns=data['column_rename'], inplace=True)

    espn_id_df = hlp.return_id_df()
    master_df = hlp.return_fms_id_df()

    df = pd.merge(df,
                  espn_id_df,
                  on=['last_name', 'college', 'position_group'],
                  how='left')

    df = pd.merge(
        df,
        master_df,
        on=['first_name', 'last_name', 'college', 'position_group'],
        how='left')

    # .copy() detaches the column subset so the assignment below does not
    # write into a slice of the merged frame.
    df = df[data['column_order']].copy()

    # Two-step normalization: the second lookup falls back to the result of
    # the first one instead of discarding it.
    college = df['college'].map(school_matching).fillna(df['college'])
    df['college'] = college.map(matching['college']).fillna(college)

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
def main():
    """Build the city economics CSV described by ``city_economics_build.json``.

    The hometown and college-town economics CSVs are renamed with
    ``column_rename`` and stacked. The result is left-joined with the city id
    frame from ``hlp.return_fms_city_id`` on ``city_state``, its
    ``numerical_columns`` are converted with ``hlp.currency_to_float``, and it
    is reduced to ``column_keep`` with the last row per ``fms_city_id`` kept
    before being written without the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "city_economics_build.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])

    # 'homewtown_econ' is the key spelling the code reads from the config.
    source = os.path.join(source_dir, data['homewtown_econ']['folder'],
                          data['homewtown_econ']['file'])
    df = pd.read_csv(source)
    df.rename(columns=data['column_rename'], inplace=True)

    source = os.path.join(source_dir, data['collegetown_econ']['folder'],
                          data['collegetown_econ']['file'])
    college_town_df = pd.read_csv(source)
    college_town_df.rename(columns=data['column_rename'], inplace=True)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([df, college_town_df], ignore_index=True)

    city_df = hlp.return_fms_city_id()
    df = df.merge(city_df, on='city_state', how='left')

    for column in data['numerical_columns']:
        # convert currency to float, remove $ and ,
        df[column] = df[column].apply(hlp.currency_to_float)

    # Non-inplace drop: avoids mutating a column-sliced frame in place.
    df = df[data['column_keep']].drop_duplicates(subset='fms_city_id',
                                                 keep='last')

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 8
0
def main():
    """Build the player master CSV described by ``player_master.json``.

    The input CSV has its ``college`` mapped through the school matching
    dictionary, ``player`` split into first and last name, and position group
    and section derived from ``pos``. It is left-joined with the id frame from
    ``hlp.return_id_df``, given an integer ``fms_id`` from the category codes
    of a name/position key, reduced to ``keep_columns`` and written without
    the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "player_master.json")) as f:
        data = json.load(f)

    matching = hlp.return_matching_dict()

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])
    source = os.path.join(source_dir, data['folder'], data['file'])
    df = pd.read_csv(source)

    df['college'] = df['college'].map(hlp.return_college_matching_dict())

    # Split once and reuse; only the first two whitespace-separated tokens of
    # the player name are used.
    name_parts = df['player'].str.split(' ')
    df['first_name'] = name_parts.str[0]
    df['last_name'] = name_parts.str[1]
    df['position_group'] = df['pos'].map(matching['position_groups'])
    df['section'] = df['position_group'].map(matching['section'])
    df.rename(columns=data['column_rename'], inplace=True)

    espn_id_df = hlp.return_id_df()
    df = pd.merge(df,
                  espn_id_df,
                  on=['last_name', 'college', 'position_group'],
                  how='left')

    # NOTE(review): "draft_year" is appended as a literal string, not as the
    # value of a draft_year column, so it does not distinguish players.
    # Confirm whether df['draft_year'] was intended before changing the ids.
    id_key = (df['first_name'] + '_' + df['last_name'] + '_' +
              df['position_group'] + '_' + "draft_year")
    df = df.assign(fms_id=id_key.astype('category').cat.codes)

    df = df[data['keep_columns']]

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
def main():
    """Build the colleges dimension CSV from ``dimensions_colleges_build.json``.

    Reads the colleges CSV named in the config, keeps the last row per
    ``fms_college_id`` and writes the result to
    ``output_folder``/``output_file`` without the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "dimensions_colleges_build.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])
    source = os.path.join(source_dir, data['colleges']['folder'],
                          data['colleges']['file'])

    df = pd.read_csv(source)
    df = df.drop_duplicates(subset='fms_college_id', keep='last')

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
def main():
    """Build the college economics CSV from ``college_economics_build.json``.

    The input CSV is renamed, reduced to ``column_keep``, has its
    ``numerical_columns`` converted with ``hlp.currency_to_float`` and its
    ``college`` names normalized. It is then left-joined with the college id
    frame from ``hlp.return_fms_college_id`` on ``college``, ordered by
    ``column_order`` and written without the index.
    """
    school_matching = hlp.return_college_matching_dict()

    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "college_economics_build.json")) as f:
        data = json.load(f)

    matching = hlp.return_matching_dict()

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])
    source = os.path.join(source_dir, data['folder'], data['file'])

    df = pd.read_csv(source)
    df.rename(columns=data['column_rename'], inplace=True)
    # .copy() detaches the column subset so the assignments below do not
    # write into a slice of the original frame.
    df = df[data['column_keep']].copy()

    for column in data['numerical_columns']:
        # convert currency to float, remove $ and ,
        df[column] = df[column].apply(hlp.currency_to_float)

    # Each lookup falls back to the value it was given when no entry exists.
    df['college'] = df['college'].map(school_matching).fillna(df['college'])
    df['college'] = df['college'].map(matching['college']).fillna(
        df['college'])

    master_college_df = hlp.return_fms_college_id()
    df = df.merge(master_college_df, on='college', how='left')

    df = df[data['column_order']]

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 11
0
def main():
    """Build the college metrics CSV described by ``facts_college_metrics.json``.

    Takes the ``fms_college_id`` column of the colleges dimension CSV,
    left-joins the college budget CSV on it, renames columns with
    ``column_rename`` and writes the result to
    ``output_folder``/``output_file`` without the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "facts_college_metrics.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # The dimension file path is resolved directly under two_up, not under
    # data['source'].
    source = os.path.join(two_up, data['dimension_colleges']['folder'],
                          data['dimension_colleges']['file'])
    df = pd.read_csv(source)
    # Double brackets keep a one-column DataFrame instead of a Series.
    df = df[['fms_college_id']]

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])
    source = os.path.join(source_dir, data['college_budget']['folder'],
                          data['college_budget']['file'])
    college_budget_df = pd.read_csv(source)

    df = pd.merge(df, college_budget_df, on=['fms_college_id'], how='left')

    df.rename(columns=data['column_rename'], inplace=True)

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 12
0
def main():
    """Build the running-back reporting CSV described by ``reporting_rb.json``.

    Players whose ``position`` contains "RB" are taken from the players
    dimension and enriched by left joins with college columns, player
    metrics, per-college averaged college metrics, city metrics and
    conference columns. The last row per ``fms_id`` is kept after every join.
    Derived columns are then added: a height/weight ratio, a conference
    scale, per-game and conference-scaled stats, and ``rb_``-prefixed
    z-scores. The statistics behind each z-score are passed to
    ``hlp.write_representative_statistics``. The frame is ordered by
    ``column_order`` and written without the index.
    """
    prefix = "rb_"
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "reporting_rb.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))
    target_dir = os.path.join(two_up, data['target'])

    source = os.path.join(two_up, data['dimension_players']['folder'],
                          data['dimension_players']['file'])
    df = pd.read_csv(source)
    # na=False treats a missing position as "not an RB" instead of producing
    # a mask with NaN, which boolean indexing rejects.
    df = df[df['position'].str.contains("RB", na=False)]

    ### get city IDs for colleges

    # The colleges dimension is read once and reused for the conference
    # columns further down.
    source = os.path.join(two_up, data['dimension_colleges']['folder'],
                          data['dimension_colleges']['file'])
    colleges_df = pd.read_csv(source)
    college_city_df = colleges_df[data['dimension_colleges_keep_columns']]

    ### merge dimension players and dimension colleges ###

    df = pd.merge(df, college_city_df, on='fms_college_id', how='left')
    df = df.drop_duplicates(subset='fms_id', keep='last')

    ### player stats ###

    source = os.path.join(two_up, data['facts_player_metrics']['folder'],
                          data['facts_player_metrics']['file'])
    player_stats_df = pd.read_csv(source)

    df = pd.merge(df, player_stats_df, on='fms_id', how='left')
    df = df.drop_duplicates(subset='fms_id', keep='last')

    ### college stats ###

    source = os.path.join(two_up, data['facts_college_metrics']['folder'],
                          data['facts_college_metrics']['file'])
    college_stats_df = pd.read_csv(source)
    college_stats_df = college_stats_df[data['college_stats_keep_columns']]
    college_stats_df = college_stats_df.groupby(
        'fms_college_id').mean().reset_index()

    df = pd.merge(df, college_stats_df, on='fms_college_id', how='left')
    df = df.drop_duplicates(subset='fms_id', keep='last')

    ### city stats ###

    source = os.path.join(two_up, data['facts_cities_metrics']['folder'],
                          data['facts_cities_metrics']['file'])
    city_stats_df = pd.read_csv(source)

    df = pd.merge(df, city_stats_df, on='fms_city_id', how='left')
    df = df.drop_duplicates(subset='fms_id', keep='last')

    ### add conference ###

    conference_df = colleges_df[data['conference_keep_columns']]

    df = pd.merge(df, conference_df, on='fms_college_id', how='left')
    df = df.drop_duplicates(subset='fms_id', keep='last')

    ### math transformations ###

    df['hw_ratio'] = df['college_height_inches'] / df['college_weight_pounds']
    # Conferences missing from the configured scale fall back to 0.7.
    df['conference_scale'] = df['conference'].map(data['conference_scale'])
    df['conference_scale'] = df['conference_scale'].fillna(0.7)
    df['conference_scale'] = df['conference_scale'].astype(float)

    for column in data['per_game_columns']:
        per_game_name = str(column) + '_pg'
        df[per_game_name] = df[column] / df['rushing_games']
        df[per_game_name + "_cf_scaled"] = (df[per_game_name] *
                                            df['conference_scale'])

    ### apply z score ###

    z_score_list = []  # names of the z-score columns added below
    for col in data['z_score_columns']:
        col_zscore = prefix + col + '_zscore'
        z_score_list.append(col_zscore)
        # Local names avoid shadowing the min/max builtins.
        col_mean = df[col].mean()
        col_stdev = df[col].std(ddof=0)
        col_min = df[col].min()
        col_max = df[col].max()
        df[col_zscore] = (df[col] - col_mean) / col_stdev
        hlp.write_representative_statistics(col_zscore, col_mean, col_stdev,
                                            col_min, col_max)

    df = df[data['column_order']]

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 13
0
def _load_college_source(source_dir, spec, school_matching, rename=None,
                         keep=None):
    """Read one college-level CSV and reduce it to one row per college.

    ``spec`` supplies the ``folder`` and ``file`` below ``source_dir``. When
    given, ``rename`` is applied before the ``keep`` column selection. The
    first row per ``college`` is retained and the names are mapped through
    ``school_matching``, keeping the original name where no entry exists.
    """
    frame = pd.read_csv(os.path.join(source_dir, spec['folder'], spec['file']))
    if rename is not None:
        frame.rename(columns=rename, inplace=True)
    if keep is not None:
        frame = frame[keep]
    frame = frame.drop_duplicates(subset='college').reset_index(drop=True)
    frame['college'] = frame['college'].map(school_matching).fillna(
        frame['college'])
    return frame


def main():
    """Build the college master CSV described by ``college_master.json``.

    Five college-level sources (combine, weather, economics, budget,
    conferences) are loaded and passed to ``gld.golden_source_merge`` keyed
    on ``college``. The returned name mapping and the existing
    ``matching['college']`` mapping are then applied to every source, and the
    sources are left-joined onto the merged frame one by one. City and state
    are split out of ``city_state``, ``fms_college_id`` is assigned from the
    category codes of ``college``, and city ids are joined in. The new name
    mapping is stored under ``matching['college']`` and written with
    ``hlp.write_matching_dict``; the frame is written without the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "college_master.json")) as f:
        data = json.load(f)
    matching = hlp.return_matching_dict()
    school_matching = hlp.return_college_matching_dict()

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])

    # The order matters: it is the order in which the sources are handed to
    # the golden source merge and later joined onto its result.
    combine_df = _load_college_source(source_dir, data['combine'],
                                      school_matching,
                                      keep=data['combine_keep'])
    college_weather_df = _load_college_source(
        source_dir, data['college_weather'], school_matching,
        rename=data['college_weather_rename'],
        keep=data['college_weather_keep'])
    college_econ_df = _load_college_source(
        source_dir, data['college_econ'], school_matching,
        rename=data['college_econ_rename'],
        keep=data['college_econ_keep'])
    college_budget_df = _load_college_source(
        source_dir, data['college_budget'], school_matching,
        rename=data['college_budget_rename'],
        keep=data['college_budget_keep'])
    college_conference_df = _load_college_source(source_dir,
                                                 data['conferences'],
                                                 school_matching)

    source_frames = (combine_df, college_weather_df, college_econ_df,
                     college_budget_df, college_conference_df)

    df, matching_dict = gld.golden_source_merge(list(source_frames),
                                                ['college'], 98)

    # hand jam Texas so it doesn't match with Texas College
    matching_dict['Texas'] = 'Texas'

    # remap names: first with the fresh mapping, then with the mapping that
    # was already stored under matching['college']
    for frame in source_frames:
        for mapping in (matching_dict, matching['college']):
            frame['college'] = frame['college'].map(mapping).fillna(
                frame['college'])

    for frame in source_frames:
        df = df.merge(frame, how='left', on='college').drop_duplicates(
            subset='college').reset_index(drop=True)

    # Non-string values (e.g. NaN from the left joins) yield an empty string.
    df['city'] = df['city_state'].apply(lambda x: x.split(',')[0]
                                        if isinstance(x, str) else "")
    df['state'] = df['city_state'].apply(lambda x: x.split(',')[1]
                                         if isinstance(x, str) else "")
    df = df.assign(fms_college_id=(df['college']).astype('category').cat.codes)

    geo_df = hlp.return_fms_city_id()

    df = df.merge(geo_df, on=['city_state'], how='left')

    df = df[data['keep_columns']]

    # Replace the stored college mapping with the one produced by this run.
    matching.update({'college': matching_dict})
    hlp.write_matching_dict(matching)

    # drop duplicates (non-inplace: df is a column-sliced frame)
    df = df.drop_duplicates(subset='fms_college_id', keep='last')

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
Esempio n. 14
0
def main():
    """Build the geo master CSV described by ``geo_master.json``.

    College cities and hometowns are loaded, reduced and renamed per the
    config, then passed to ``gld.golden_source_merge`` keyed on
    ``city_state``. The returned mapping is applied to both sources, which
    are left-joined onto the merged frame. Latitude and longitude prefer the
    hometown values and fall back to the college values. City and state are
    split out of ``city_state`` and ``fms_city_id`` is assigned from its
    category codes. The mapping is stored under ``matching['cities']`` and
    written with ``hlp.write_matching_dict``; the frame is written without
    the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "geo_master.json")) as f:
        data = json.load(f)
    matching = hlp.return_matching_dict()

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])

    # pull in colleges

    source = os.path.join(source_dir, data['colleges']['folder'],
                          data['colleges']['file'])
    college_cities_df = pd.read_csv(source)
    # Non-inplace rename: avoids mutating a column-sliced frame in place.
    college_cities_df = college_cities_df[data['college_keep']].rename(
        columns=data["college_df_rename"])

    # pull in hometowns

    source = os.path.join(source_dir, data['hometowns']['folder'],
                          data['hometowns']['file'])
    hometown_df = pd.read_csv(source)
    hometown_df = hometown_df[data['hometowns_keep']].rename(
        columns=data["hometown_df_rename"])

    sources_list = [college_cities_df, hometown_df]

    df, matching_dict = gld.golden_source_merge(sources_list, ['city_state'],
                                                98)
    hometown_df['city_state'] = hometown_df['city_state'].map(
        matching_dict).fillna(hometown_df['city_state'])
    college_cities_df['city_state'] = college_cities_df['city_state'].map(
        matching_dict).fillna(college_cities_df['city_state'])

    df = df.merge(hometown_df, how='left', on='city_state')
    df = df.merge(college_cities_df, how='left', on='city_state')
    # _x columns come from the hometown merge, _y from the college merge.
    df['latitude'] = df['latitude_x'].combine_first(df['latitude_y'])
    df['longitude'] = df['longitude_x'].combine_first(df['longitude_y'])

    # Non-string values (e.g. NaN) yield an empty string instead of raising
    # AttributeError on .split.
    df['city'] = df['city_state'].apply(lambda x: x.split(',')[0]
                                        if isinstance(x, str) else "")
    df['state'] = df['city_state'].apply(lambda x: x.split(',')[1]
                                         if isinstance(x, str) else "")
    df = df.assign(fms_city_id=(df['city_state']).astype('category').cat.codes)
    df['country'] = ""  # to be filled in later

    df = df[data['keep_columns']]

    # Replace the stored city mapping with the one produced by this run.
    matching.update({'cities': matching_dict})
    hlp.write_matching_dict(matching)

    # drop duplicates (non-inplace: df is a column-sliced frame)
    df = df.drop_duplicates(subset='fms_city_id', keep='last')

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)
def main():
    """Build the player metrics CSV described by ``facts_player_metrics.json``.

    Starts from the ``fms_id`` column of the players dimension CSV and
    left-joins three sources on ``fms_id``: madden ratings (with a
    ``max_madden`` column holding the NaN-ignoring row maximum of the kept
    rating columns), combine stats and college stats. Each source keeps only
    its last row per ``fms_id``. The result is renamed, ordered by
    ``column_order`` and written without the index.
    """
    local_path = os.path.dirname(os.path.abspath(__file__))
    # Context manager so the config handle is closed as soon as it is parsed.
    with open(os.path.join(local_path, "facts_player_metrics.json")) as f:
        data = json.load(f)

    two_up = os.path.abspath(os.path.join(local_path, "../.."))

    # The dimension file path is resolved directly under two_up, not under
    # data['source'].
    source = os.path.join(two_up, data['dimension_players']['folder'],
                          data['dimension_players']['file'])
    df = pd.read_csv(source)
    # Double brackets keep a one-column DataFrame instead of a Series.
    df = df[['fms_id']]

    ###  madden stats ###

    # os.path.join keeps the paths portable between macOS and Windows.
    source_dir = os.path.join(two_up, data['source'])
    target_dir = os.path.join(two_up, data['target'])
    source = os.path.join(source_dir, data['madden_ratings']['folder'],
                          data['madden_ratings']['file'])

    madden_df = pd.read_csv(source)
    # drop duplicates, need to fix this later
    # .copy() so max_madden is added to an independent frame, not a slice.
    madden_df = madden_df[data["madden_keep_pre"]].drop_duplicates(
        subset='fms_id', keep='last').copy()

    # Row-wise maximum over every kept column except the id, ignoring NaN.
    rating_columns = madden_df.columns.difference(['fms_id'])
    madden_df['max_madden'] = np.nanmax(madden_df[rating_columns].values,
                                        axis=1)

    madden_df = madden_df[data['madden_keep_post']]

    df = pd.merge(df, madden_df, on=['fms_id'], how='left')

    ### combine stats ###

    source = os.path.join(source_dir, data['combine_stats']['folder'],
                          data['combine_stats']['file'])
    combine_df = pd.read_csv(source)

    # drop duplicates, need to fix this later
    combine_df = combine_df.drop_duplicates(subset='fms_id', keep='last')
    df = pd.merge(df, combine_df, on=['fms_id'], how='left')

    ### college stats ###

    source = os.path.join(source_dir, data['college_stats']['folder'],
                          data['college_stats']['file'])
    df_college_stats = pd.read_csv(source)

    # drop duplicates, need to fix this later
    df_college_stats = df_college_stats.drop_duplicates(subset='fms_id',
                                                        keep='last')
    df = pd.merge(df, df_college_stats, on='fms_id', how='left')  # left join

    df.rename(columns=data['column_rename'], inplace=True)
    df = df[data['column_order']]

    target_folder = os.path.join(target_dir, data['output_folder'])
    hlp.make_folder_if_not_exists(target_folder)
    target = os.path.join(target_folder, data['output_file'])
    df.to_csv(target, index=False)