Ejemplo n.º 1
0
def get_desc(data, factor1, factor2, measurement1, measurement1_std,
             measurement1_ste, measurement2, measurement2_std,
             measurement2_ste):
    """Summarise two measurements for every (factor1, factor2) group.

    For each group the result holds the mean of both measurements, their
    standard deviation, standard error and the bounds of a two-sided
    Student-t confidence interval around the mean.

    Arguments:
        data (DataFrame): Observations; must contain the two factor columns,
            both measurement columns and a column named ``'n'``.
            NOTE(review): ``'n'`` is averaged per group and then used as the
            sample size, so it is presumably constant within a group --
            confirm against the caller.
        factor1, factor2 (string): Columns to group by.
        measurement1, measurement2 (string): Columns to summarise.
        measurement1_std, measurement2_std (string): Names of the output
            columns receiving the per-group standard deviation.
        measurement1_ste, measurement2_ste (string): Names of the output
            columns receiving the per-group standard error.

    Returns:
        DataFrame: One row per group with the factors, ``'n'``, and for each
        measurement its mean, standard error, standard deviation and
        lower/upper confidence bound.

    Note:
        The confidence level (``confidence``) and the names of the bound
        columns (``lconf1``, ``hconf1``, ``lconf2``, ``hconf2``) are read
        from names defined outside this function.
    """
    grouped = data.groupby([factor1, factor2])
    stats = grouped.mean().reset_index()
    stats_std = grouped.std().reset_index()

    root_n = np.sqrt(stats['n'])
    # Two-sided critical value of Student's t with n - 1 degrees of freedom;
    # it is identical for both measurements, so compute it once.
    t_crit = sp.stats.t.ppf((1 + confidence) / 2., stats['n'] - 1)

    stats[measurement1_std] = stats_std[measurement1]
    stats[measurement1_ste] = stats[measurement1_std] / root_n
    stats[measurement2_std] = stats_std[measurement2]
    stats[measurement2_ste] = stats[measurement2_std] / root_n

    stats['conf1'] = stats[measurement1_ste] * t_crit
    stats[lconf1] = stats[measurement1] - stats['conf1']
    stats[hconf1] = stats[measurement1] + stats['conf1']

    # The half-width for measurement2 must come from measurement2's own
    # standard error (it previously reused measurement1's).
    stats['conf2'] = stats[measurement2_ste] * t_crit
    stats[lconf2] = stats[measurement2] - stats['conf2']
    stats[hconf2] = stats[measurement2] + stats['conf2']

    # Fix the column order and drop the intermediate half-width columns.
    stats = stats[[
        factor1, 'n', factor2, measurement1, measurement1_ste,
        measurement1_std, lconf1, hconf1, measurement2, measurement2_ste,
        measurement2_std, lconf2, hconf2
    ]]
    return stats
Ejemplo n.º 2
0
def read_nyt_csv(uri, meta, earliest_date, latest_date):
    """Load per-state death counts from a CSV and attach state metadata.

    The CSV at ``uri`` must provide ``date``, ``state`` and ``deaths``
    columns. Only rows whose state appears in ``meta.State`` are kept, dates
    are converted to ``pandas.Period`` and restricted to
    ``earliest_date <= Date`` (and ``Date <= latest_date`` when
    ``latest_date`` is truthy). The columns of ``meta`` are then joined on
    ``State`` and rows whose ``ST`` code is AS, GU, MP, PR or VI are dropped.

    Returns a DataFrame with ``State`` and ``Date`` as ordinary columns,
    followed by ``Deaths`` and the columns of ``meta``.
    """
    raw = pandas.read_csv(uri)

    known_state = raw.state.isin(meta.State)
    deaths = raw.loc[known_state, ['date', 'state', 'deaths']].rename(
        columns={'date': 'Date', 'state': 'State', 'deaths': 'Deaths'})
    deaths['Date'] = [pandas.Period(str(day)) for day in deaths['Date']]

    # Build the date window as a single mask; the upper bound is optional.
    in_window = deaths.Date >= earliest_date
    if latest_date:
        in_window = in_window & (deaths.Date <= latest_date)
    deaths = deaths[in_window]

    # Index by (State, Date) so the per-state metadata can be joined on State.
    indexed = deaths.set_index(['State', 'Date']).sort_index()
    enriched = indexed.join(meta.set_index('State'))

    # Territories are excluded from the result.
    territories = ['AS', 'GU', 'MP', 'PR', 'VI']
    enriched = enriched[~enriched.ST.isin(territories)]

    return enriched.reset_index()
Ejemplo n.º 3
0
def pull_gamelogs(player_key, first_year=1999, last_year=2019):
    """Scrape a player's game logs from basketball-reference.com.

    One game-log page is requested per season from ``first_year`` to
    ``last_year`` (inclusive). Seasons whose page cannot be fetched or
    parsed are skipped, so the result only contains the seasons that
    yielded a table.

    Arguments:
        player_key (string): Player identifier inserted into the URL.
        first_year (int): First season to request. Defaults to 1999, the
            first year of Lamar Odom's career (oldest player).
        last_year (int): Last season to request (inclusive).

    Returns:
        DataFrame: Game rows of all scraped seasons stacked on top of each
        other (each season keeps its own 0-based index); an empty DataFrame
        when no season could be scraped.

    Note:
        Column names come from ``headers``, which is defined outside this
        function.
    """
    season_frames = []
    for year in range(first_year, last_year + 1):
        # NOTE(review): the directory letter is hard-coded to 'g' for every
        # player -- confirm this is intended for all player keys.
        url = 'https://www.basketball-reference.com/players/g/{}/gamelog/{}'.format(player_key, year)
        try:
            # Close the HTTP response as soon as the page has been parsed.
            with urlopen(url) as html:
                soup = BeautifulSoup(html)
            rows = soup.findAll('tbody')[0].findAll('tr')
            player_stats = [[td.getText() for td in row.findAll('td')]
                            for row in rows]
            stats = pd.DataFrame(player_stats, columns=headers)
            # Rows without a game number are not actual games.
            stats = stats[stats['G'].notnull()]
        except (OSError, IndexError, ValueError, KeyError):
            # Best effort: network/HTTP failures (OSError), pages without a
            # table (IndexError) or with an unexpected layout
            # (ValueError/KeyError) just mean no data for this season.
            continue
        season_frames.append(stats.reset_index(drop=True))

    if not season_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(season_frames)
Ejemplo n.º 4
0
def compute_descriptive_stats(df,
                              year,
                              stats_for,
                              rcat_col="rcat",
                              nuts2_col="nuts2_name",
                              nuts3_col="nuts3_name",
                              area_col_suffix="_name",
                              growth_col_suffix="_growth",
                              ratio_col_suffix="_gdp_ratio",
                              veh_col="vehicle_measure",
                              area_name_col="area_name",
                              area_level_col="area_level",
                              year_col="year",
                              gdp_col="gdp",
                              gdp_unit_col="gdp_unit",
                              growth_col="gdp_growth",
                              ratio_col="gdp_ratio",
                              road_all_indicator="all",
                              national="nat",
                              nat_level="nuts1",
                              nat_gdp_col="gdp",
                              nat_gdp_unit_col="gdp_unit",
                              nat_gdp_growth_col="gdp_growth"):
    """compute_descriptive_stats

    Computes descriptive stats with the pandas describe function for every
    NUTS2 area, NUTS3 area and the nation as a whole, for each road category
    and for all road categories together, and attaches the area's gdp
    figures to each result row.

    Arguments:
        df (DataFrame): Contains the traffic data by observation point that
            we wish to aggregate.
        year (string): Year the df corresponds to.
        stats_for (list): List of columns that we want to aggregate.
        rcat_col (string): Column in df that stores the road category; also
            the name of the road category column in describe_df.
        nuts2_col (string): Column in df that stores the nuts2 category.
        nuts3_col (string): Column in df that stores the nuts3 category.
        area_col_suffix (string): Suffix to add to the nuts level that gives
            the column in df that stores the area name.
        growth_col_suffix (string): Suffix to add to the nuts level that gives
            the column in df that stores the gdp growth.
        ratio_col_suffix (string): Suffix to add to the nuts level that gives
            the column in df that stores the gdp ratio on previous year.
        veh_col (string): Name of the column in describe_df where the traffic
            flow measure is stored.
        area_name_col (string): Name of the column in describe_df where the
            area name is stored.
        area_level_col (string): Name of the column in describe_df where the
            nuts level is stored.
        year_col (string): Name of the column in describe_df where the
            year is stored.
        gdp_col (string): Name of the column in describe_df where the
            gdp value is stored.
        gdp_unit_col (string): Name of the column in describe_df where the
            gdp unit is stored.
        growth_col (string): Name of the column in describe_df where the
            gdp growth value is stored.
        ratio_col (string): Name of the column in describe_df where the
            gdp ratio on previous year value is stored.
        road_all_indicator (string): Value in rcat_col in describe_df that
            corresponds to stats for all road categories.
        national (string): Label to go in describe_df[area_name_col]
            indicating this is stats for the nation.
        nat_level (string): Label to go in describe_df[area_level_col]
            indicating this is stats for the nation.
        nat_gdp_col (string): Column in df that stores the national gdp.
        nat_gdp_unit_col (string): Column in df that stores the unit of the
            national gdp.
        nat_gdp_growth_col (string): Column in df that stores the national
            gdp growth, in percent.

    Returns:
        describe_df (DataFrame): Contains descriptive stats by road type
        and NUTS area for observations in df, in tall format (one row per
        measure in stats_for for every area / road category combination).

    Note:
        The per-area gdp value and unit are read from the df columns
        "<level>_value" and "<level>_unit" (e.g. "nuts2_value"); these two
        suffixes are not configurable.

    Todo:
        refactor with yearly descriptive stats, see comments

    """
    # Every NUTS2 / NUTS3 area present in df, plus one entry for the nation.
    nuts_areas = [(area, "nuts2")
                  for area in df[nuts2_col].dropna().unique().tolist()]
    nuts_areas += [(area, "nuts3")
                   for area in df[nuts3_col].dropna().unique().tolist()]
    nuts_areas.append((national, nat_level))

    road_cats = df[rcat_col].unique().tolist()
    road_cats.append(road_all_indicator)

    stats_lst = []
    for area_name, area_col in nuts_areas:
        # The gdp figures only depend on the area, so look them up once per
        # area instead of once per (area, road category) pair.
        if area_name == national:
            area_mask = pd.Series(True, index=df.index)
            # The national figures are the same on every row; take the first
            # row by position so a non-default index does not break this.
            gdp = df[nat_gdp_col].iloc[0]
            gdp_unit = df[nat_gdp_unit_col].iloc[0]
            growth = df[nat_gdp_growth_col].iloc[0]
            # Ratio on the previous year follows from the growth percentage
            # (not from the gdp value itself).
            ratio = 1 + (growth / 100)
        else:
            area_mask = df[area_col + area_col_suffix] == area_name
            area_rows = df[area_mask]
            gdp = area_rows[area_col + "_value"].iloc[0]
            gdp_unit = area_rows[area_col + "_unit"].iloc[0]
            growth = area_rows[area_col + growth_col_suffix].iloc[0]
            ratio = area_rows[area_col + ratio_col_suffix].iloc[0]

        for road_cat in road_cats:
            if road_cat == road_all_indicator:
                # all roads in the area
                mask = area_mask
            else:
                # only one road category in the area
                mask = area_mask & (df[rcat_col] == road_cat)

            # to store data in tall format wrt measure type
            stats = df[mask][stats_for].describe().T
            stats = stats.reset_index().rename(columns={"index": veh_col})

            # label what this row of stats corresponds to
            stats[area_name_col] = area_name
            stats[area_level_col] = area_col
            stats[year_col] = year
            stats[rcat_col] = road_cat

            stats[gdp_col] = gdp
            stats[gdp_unit_col] = gdp_unit
            stats[growth_col] = growth
            stats[ratio_col] = ratio
            stats_lst.append(stats)

    describe_df = pd.concat(stats_lst)

    return describe_df