def get_desc(data, factor1, factor2, measurement1, measurement1_std,
             measurement1_ste, measurement2, measurement2_std,
             measurement2_ste):
    """Summarise two measurements per (factor1, factor2) group.

    For every group the mean, standard deviation, standard error and a
    two-sided Student-t confidence interval are computed for both
    measurements.

    Relies on module-level names that are not parameters: ``confidence``
    (the confidence level used for the t quantile) and the output column
    names ``lconf1``, ``hconf1``, ``lconf2`` and ``hconf2``.

    Arguments:
        data (DataFrame): Input observations. Must contain the two factor
            columns, both measurement columns and a column named ``'n'``.
        factor1, factor2: Names of the columns to group by.
        measurement1, measurement2: Names of the measurement columns.
        measurement1_std, measurement2_std: Names of the output columns
            receiving the per-group standard deviation.
        measurement1_ste, measurement2_ste: Names of the output columns
            receiving the per-group standard error.

    Returns:
        DataFrame: One row per group with the factor columns, ``'n'``, and
        for each measurement its mean, standard error, standard deviation
        and lower/upper confidence bounds.
    """
    grouped = data.groupby([factor1, factor2])
    stats = grouped.mean().reset_index()
    stats_std = grouped.std().reset_index()

    # NOTE(review): 'n' here is the per-group *mean* of the 'n' column and is
    # used as the sample size -- presumably 'n' is constant within a group;
    # confirm against the caller.
    sample_size = stats['n']
    root_n = np.sqrt(sample_size)
    # Two-sided critical t value with n - 1 degrees of freedom.
    t_crit = sp.stats.t.ppf((1 + confidence) / 2., sample_size - 1)

    per_measurement = (
        (measurement1, measurement1_std, measurement1_ste, lconf1, hconf1),
        (measurement2, measurement2_std, measurement2_ste, lconf2, hconf2),
    )
    for measurement, std_col, ste_col, low_col, high_col in per_measurement:
        stats[std_col] = stats_std[measurement]
        stats[ste_col] = stats[std_col] / root_n
        # Each interval uses the standard error of its OWN measurement
        # (the second interval previously reused measurement1's error).
        half_width = stats[ste_col] * t_crit
        stats[low_col] = stats[measurement] - half_width
        stats[high_col] = stats[measurement] + half_width

    return stats[[
        factor1, 'n', factor2,
        measurement1, measurement1_ste, measurement1_std, lconf1, hconf1,
        measurement2, measurement2_ste, measurement2_std, lconf2, hconf2,
    ]]
def read_nyt_csv(uri, meta, earliest_date, latest_date):
    """Read per-state death counts from a CSV and attach state metadata.

    The CSV must provide ``date``, ``state`` and ``deaths`` columns; they
    are exposed as ``Date`` (converted to ``pandas.Period``), ``State`` and
    ``Deaths``. Only states listed in ``meta.State`` are kept, dates are
    restricted to ``[earliest_date, latest_date]`` (the upper bound is
    skipped when ``latest_date`` is falsy), the columns of ``meta`` are
    joined on the state name, and rows whose ``ST`` code is one of
    AS, GU, MP, PR or VI are dropped.

    Arguments:
        uri: Anything ``pandas.read_csv`` accepts (path, URL, buffer).
        meta (DataFrame): State metadata with at least ``State`` and ``ST``
            columns.
        earliest_date: Lower bound compared against the ``Date`` periods.
        latest_date: Upper bound compared against the ``Date`` periods, or
            a falsy value for no upper bound.

    Returns:
        DataFrame: Rows sorted by state and date before the join, with the
        index reset to ordinary columns.
    """
    raw = pandas.read_csv(uri)

    known_state = raw.state.isin(meta.State)
    frame = raw.loc[known_state, ['date', 'state', 'deaths']]
    frame = frame.rename(
        columns={'date': 'Date', 'state': 'State', 'deaths': 'Deaths'})
    frame['Date'] = [pandas.Period(str(raw_date)) for raw_date in frame['Date']]

    not_too_early = frame['Date'] >= earliest_date
    frame = frame[not_too_early]
    if latest_date:
        not_too_late = frame['Date'] <= latest_date
        frame = frame[not_too_late]

    frame = frame.set_index(['State', 'Date']).sort_index()

    # Pull in the per-state metadata columns.
    state_meta = meta.set_index('State')
    frame = frame.join(state_meta)

    # Remove territories.
    territories = ['AS', 'GU', 'MP', 'PR', 'VI']
    is_territory = frame.ST.isin(territories)
    return frame[~is_territory].reset_index()
def pull_gamelogs(player_key, first_year=1999, last_year=2019):
    """Scrape a player's per-season game logs from basketball-reference.com.

    One game-log page is requested per season. Seasons whose page cannot be
    fetched or parsed (for example because the player did not play that
    year) are skipped; all remaining seasons are stacked into one frame.

    Relies on the module-level ``headers`` list for the column names, which
    must include ``'G'``.

    Arguments:
        player_key (str): Player identifier used in the site's URLs.
        first_year (int): First season to request. Defaults to 1999, the
            first year of Lamar Odom's career (oldest player).
        last_year (int): Last season to request, inclusive.

    Returns:
        DataFrame: Game rows of every season that could be read, each
        season's rows indexed from 0. Empty when no season could be read.
    """
    from http.client import HTTPException

    # The site files each player under the first letter of the identifier;
    # this was previously hard-coded to 'g'.
    # NOTE(review): confirm every player_key in use follows that layout.
    directory = player_key[:1]

    season_frames = []
    for year in range(first_year, last_year + 1):
        # URL to scrape
        url = 'https://www.basketball-reference.com/players/{}/{}/gamelog/{}'.format(
            directory, player_key, year)
        try:
            # HTML from given URL; the response is closed once parsed.
            # NOTE(review): no parser is named, so BeautifulSoup picks the
            # best one installed -- kept as in the original.
            with urlopen(url) as response:
                soup = BeautifulSoup(response)
            rows = soup.findAll('tbody')[0].findAll('tr')
            player_stats = [[td.getText() for td in row.findAll('td')]
                            for row in rows]
            stats = pd.DataFrame(player_stats, columns=headers)
        except (OSError, HTTPException, IndexError, ValueError):
            # Best effort: network/HTTP failure, page without a table body,
            # or a table whose width does not match ``headers``.
            continue

        # Rows without a 'G' value are dropped (repeated header rows inside
        # the table yield no <td> cells and therefore end up all-null).
        stats = stats[stats['G'].notnull()].reset_index(drop=True)
        season_frames.append(stats)

    if not season_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(season_frames)
def compute_descriptive_stats(df, year, stats_for,
                              rcat_col="rcat",
                              nuts2_col="nuts2_name",
                              nuts3_col="nuts3_name",
                              area_col_suffix="_name",
                              growth_col_suffix="_growth",
                              ratio_col_suffix="_gdp_ratio",
                              veh_col="vehicle_measure",
                              area_name_col="area_name",
                              area_level_col="area_level",
                              year_col="year",
                              gdp_col="gdp",
                              gdp_unit_col="gdp_unit",
                              growth_col="gdp_growth",
                              ratio_col="gdp_ratio",
                              road_all_indicator="all",
                              national="nat",
                              nat_level="nuts1",
                              nat_gdp_col="gdp",
                              nat_gdp_unit_col="gdp_unit",
                              nat_gdp_growth_col="gdp_growth"):
    """compute_descriptive_stats

    Computes descriptive stats with the pandas describe function for every
    NUTS2 area, every NUTS3 area and the nation as a whole, each broken
    down by road category (plus one entry covering all road categories),
    and attaches that area's gdp figures for comparison.

    Arguments:
        df (DataFrame): Contains the traffic data by observation point that
            we wish to aggregate.
        year (string): Year the df corresponds to.
        stats_for (list): List of columns that we want to aggregate.
        rcat_col (string): Column in df that stores the road category; also
            the name of the road category column in describe_df.
        nuts2_col (string): Column in df that stores the nuts2 category.
        nuts3_col (string): Column in df that stores the nuts3 category.
        area_col_suffix (string): Suffix to add to the nuts level that
            gives the column in df that stores the area name.
        growth_col_suffix (string): Suffix to add to the nuts level that
            gives the column in df that stores the gdp growth.
        ratio_col_suffix (string): Suffix to add to the nuts level that
            gives the column in df that stores the gdp ratio on previous
            year.
        veh_col (string): Name of the column in describe_df where the
            traffic flow measure is stored.
        area_name_col (string): Name of the column in describe_df where the
            area name is stored.
        area_level_col (string): Name of the column in describe_df where
            the nuts level is stored.
        year_col (string): Name of the column in describe_df where the year
            is stored.
        gdp_col (string): Name of the column in describe_df where the gdp
            value is stored.
        gdp_unit_col (string): Name of the column in describe_df where the
            gdp unit is stored.
        growth_col (string): Name of the column in describe_df where the
            gdp growth value is stored.
        ratio_col (string): Name of the column in describe_df where the gdp
            ratio on previous year value is stored.
        road_all_indicator (string): Value in rcat_col in describe_df that
            corresponds to stats for all road categories.
        national (string): Label to go in describe_df[area_name_col]
            indicating this is stats for the nation.
        nat_level (string): Label to go in describe_df[area_level_col]
            indicating this is stats for the nation.
        nat_gdp_col (string): Column in df that stores the national gdp.
        nat_gdp_unit_col (string): Column in df that stores the unit of the
            national gdp.
        nat_gdp_growth_col (string): Column in df that stores the national
            gdp growth.

    Returns:
        describe_df (DataFrame): Contains descriptive stats by road type
            and NUTS area for observations in df, one row per
            (area, road category, column in stats_for).

    Raises:
        IndexError: If df has no rows, so no national gdp can be read.

    Todo:
        refactor with yearly descriptive stats, see comments
    """
    nuts2_areas = df[nuts2_col].dropna().unique().tolist()
    nuts3_areas = df[nuts3_col].dropna().unique().tolist()

    # (area name, nuts level, is this the whole nation?). An explicit flag
    # replaces the former ``is`` comparison of strings, which only worked
    # through object identity.
    areas = [(area, "nuts2", False) for area in nuts2_areas]
    areas.extend((area, "nuts3", False) for area in nuts3_areas)
    areas.append((national, nat_level, True))

    # (road category label, does it stand for all road categories?)
    road_cats = [(cat, False) for cat in df[rcat_col].unique().tolist()]
    road_cats.append((road_all_indicator, True))

    stats_lst = []
    for area_name, area_level, is_national in areas:
        # Area selection and gdp figures do not depend on the road
        # category, so they are worked out once per area.
        if is_national:
            area_rows = df
            # Take the first value by position as they are all the same;
            # a label lookup of 0 fails when df's index does not contain 0.
            gdp = df[nat_gdp_col].iloc[0]
            gdp_unit = df[nat_gdp_unit_col].iloc[0]
            growth = df[nat_gdp_growth_col].iloc[0]
            # Ratio on the previous year derives from the growth figure,
            # not from the gdp value itself.
            # NOTE(review): assumes growth is expressed in percent --
            # confirm against the data source.
            ratio = 1 + (growth / 100)
        else:
            area_rows = df[df[area_level + area_col_suffix] == area_name]
            gdp = area_rows[area_level + "_value"].iloc[0]
            gdp_unit = area_rows[area_level + "_unit"].iloc[0]
            growth = area_rows[area_level + growth_col_suffix].iloc[0]
            ratio = area_rows[area_level + ratio_col_suffix].iloc[0]

        for road_cat, all_roads in road_cats:
            if all_roads:
                selected = area_rows
            else:
                selected = area_rows[area_rows[rcat_col] == road_cat]

            # Transposed so the data is in tall format wrt measure type.
            stats = selected[stats_for].describe().T.reset_index()
            stats = stats.rename(columns={"index": veh_col})

            # Record what this set of stats corresponds to.
            stats[area_name_col] = area_name
            stats[area_level_col] = area_level
            stats[year_col] = year
            stats[rcat_col] = road_cat

            stats[gdp_col] = gdp
            stats[gdp_unit_col] = gdp_unit
            stats[growth_col] = growth
            stats[ratio_col] = ratio

            stats_lst.append(stats)

    describe_df = pd.concat(stats_lst)
    return describe_df