Esempio n. 1
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
from loadData import preProcess, filterColumn, filterMedalsOnly

# Load the pre-processed athlete data
hun_df = preProcess('./data/athlete_events.csv')

# Re-type the Medal column as an ordered categorical whose category order is
# the reverse of the current one
medal_levels = hun_df['Medal'].cat.categories
medal_reversed = CategoricalDtype(categories=list(reversed(medal_levels)),
                                  ordered=True)
hun_df['Medal'] = hun_df['Medal'].astype(medal_reversed)

# Keep only rows with a medal, then restrict to the Summer Games
filtered_df = filterColumn(filterMedalsOnly(hun_df), 'Season', 'Summer')

# A team event yields one row per team member; collapse rows sharing the same
# Games, Event and Medal so each such medal is counted once
hun_df_no_duplicates = filtered_df.drop_duplicates(
    subset=['Games', 'Event', 'Medal'])
hun_df_no_duplicates = hun_df_no_duplicates.reset_index(drop=True)

# Medal counts per gender: one row per Sex value, one column per Medal category
df2 = pd.crosstab(index=hun_df_no_duplicates['Sex'],
                  columns=hun_df_no_duplicates['Medal'])

# One colour per Medal column, applied in column order
cols = ['#D4AF37', '#BCC6CC', '#cd7f32']
ax = df2.plot(kind='barh', stacked=True, figsize=(8, 4), color=cols)

# Rebuild the legend from the plotted artists, anchored at the axes' top-right
# corner and drawn without a frame
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=labels,
          bbox_to_anchor=(1.0, 1.0),
          frameon=False)
plt.title('Medals won by Hungary according to gender',
          size=16,
Esempio n. 2
0
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from loadData import preProcess, filterColumn

# Load and preprocess data
df = preProcess('./data/athlete_events.csv')

# Keep Summer Games rows only
df = filterColumn(df, 'Season', 'Summer')

# Narrow to the needed columns, keeping a single row per ID within each year
df = df[['Year', 'ID', 'Age', 'Sex']]
df = df.drop_duplicates(subset=['Year', 'ID']).reset_index(drop=True)

# ID was only needed for the de-duplication above
df = df.drop(columns='ID')
# Flat copy (Year, Age, Sex as ordinary columns), taken before df is re-indexed
df2 = df.copy()

# Append Year and Sex to the row index so they can be grouped on by level
df = df.set_index(['Year', 'Sex'], append=True)
# Mean age for every (Year, Sex) group
df_grouped = df.groupby(level=['Year', 'Sex'])['Age'].mean()
# Pivot the 'Sex' index level into columns
avg_age_vs_time = df_grouped.unstack(level='Sex')

fig, ax = plt.subplots(figsize=(8, 6))

# One line per Sex column showing average age against Year
avg_age_vs_time.plot(ax=ax, linewidth=3)
sns.scatterplot(x="Year",
                y="Age",
                data=df2,
                hue="Sex",
Esempio n. 3
0
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
from loadData import preProcess, filterColumn

# Load the pre-processed athlete data
hun_df = preProcess('./data/athlete_events.csv')

# Select Summer Games only
summer_df = filterColumn(hun_df, 'Season', 'Summer')

# The data holds one row per athlete per event, so an athlete entered in
# several events at the same Games would contribute their age several times.
# Keep a single row per athlete (ID) per year so every athlete is counted once
# in that year's box.
summer_df = summer_df.drop_duplicates(['Year', 'ID']).reset_index(drop=True)

fig, ax = plt.subplots(figsize=(14, 6))

# Box plot of age per year, split by sex with a fixed colour for each value
sns.boxplot(x="Year",
            y="Age",
            ax=ax,
            hue="Sex",
            palette={"Male": "#18a1cd", "Female": "#fa8c00"},
            data=summer_df)

ax.set_xlabel('Year', size=14, labelpad=10)
ax.set_ylabel('Age (in years)', size=14)
ax.set_title('Age distribution of Hungarian athletes in Summer Olympics',
             size=16,
             pad=20,
             weight='heavy')
plt.show()
Esempio n. 4
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from loadData import preProcess, filterColumn

# Load and preprocess data
hun_df = preProcess('./data/athlete_events.csv')

# Select Summer Games first, then keep one row per athlete (ID) per year.
# De-duplicating on (Year, ID) before the season filter could retain only the
# non-Summer row of an athlete who has rows for both seasons in the same year;
# the filter would then remove that athlete from the Summer count entirely.
hun_df_distinct_ids = filterColumn(hun_df, 'Season', 'Summer')
hun_df_distinct_ids = hun_df_distinct_ids.drop_duplicates(
    ['Year', 'ID']).reset_index(drop=True)

# NOTE(review): colours are assigned to wedges by position, so the
# wedge/colour pairing depends on the order value_counts(sort=False) returns
# the Sex values in -- confirm it matches the intended mapping.
colours = ['#18a1cd', '#fa8c00']

fig, ax = plt.subplots(figsize=(6, 4))
# Ring-style pie ('width' < 1 leaves a hole in the middle) showing the share
# of each Sex value, labelled with whole-number percentages
ax.pie(hun_df_distinct_ids['Sex'].value_counts(sort=False),
       colors=colours,
       startangle=60,
       wedgeprops={
           'linewidth': 0.5,
           'edgecolor': 'lightgrey',
           'width': 0.7
       },
       autopct='  %.0f%%',
       pctdistance=0.6,
       labeldistance=1.1)

# Equal aspect ratio ensures that pie is drawn as a circle.