def find_contours_2D(x_values, y_values, xbins, weights=None, c1=16, c2=84):
    """
    Find upper and lower contours and median
    x_values = array, input for hist2d for x axis (typically truth)
    y_values = array, input for hist2d for y axis (typically reconstruction)
    xbins = values for the starting edge of the x bins (output from hist2d)
    c1 = percentage for lower contour bound (16% - 84% means a 68% band, so c1 = 16)
    c2 = percentage for upper contour bound (16% - 84% means a 68% band, so c2 = 84)
    Returns:
        x = values for xbins, repeated for plotting (i.e. [0,0,1,1,2,2,...])
        y_median = values for y value medians per bin, repeated for plotting (i.e. [40,40,20,20,50,50,...])
        y_lower = values for y value lower limits per bin, repeated for plotting (i.e. [30,30,10,10,20,20,...])
        y_upper = values for y value upper limits per bin, repeated for plotting (i.e. [50,50,40,40,60,60,...])
        (note: as currently written, the function returns bin centers plus the
        median/lower/upper arrays instead; see the comment before the return)
    """
    if weights is not None:
        import wquantiles as wq
    y_values = numpy.array(y_values)
    indices = numpy.digitize(x_values, xbins)
    r1_save = []
    r2_save = []
    median_save = []
    for i in range(1, len(xbins)):
        mask = indices == i
        if len(y_values[mask]) > 0:
            if weights is None:
                r1, m, r2 = numpy.percentile(y_values[mask], [c1, 50, c2])
            else:
                # wq.quantile expects a fraction in [0, 1], so convert from percent
                r1 = wq.quantile(y_values[mask], weights[mask], c1 / 100.)
                r2 = wq.quantile(y_values[mask], weights[mask], c2 / 100.)
                m = wq.median(y_values[mask], weights[mask])
        else:
            # empty bin
            r1 = numpy.nan
            m = numpy.nan
            r2 = numpy.nan
        median_save.append(m)
        r1_save.append(r1)
        r2_save.append(r2)
    median = numpy.array(median_save)
    lower = numpy.array(r1_save)
    upper = numpy.array(r2_save)
    # The original code (borrowed from the oscnext folks) returned bin edges and
    # values repeated pairwise for step-style plotting; it is removed for now:
    # x = list(itertools.chain(*zip(xbins[:-1], xbins[1:])))
    # y_median = list(itertools.chain(*zip(median, median)))
    # y_lower = list(itertools.chain(*zip(lower, lower)))
    # y_upper = list(itertools.chain(*zip(upper, upper)))
    # return x, y_median, y_lower, y_upper
    # Averaging adjacent bin edges ([1:] and [:-1]) locates the bin centers.
    return (xbins[1:] + xbins[:-1]) / 2, median, lower, upper
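# A minimal usage sketch, assuming find_contours_2D and `import numpy` are in
# scope; the `truth` and `reco` arrays here are fabricated for illustration.
# Bin reconstruction vs. truth and draw the median with a 68% band.
import numpy
import matplotlib.pyplot as plt

rng = numpy.random.default_rng(0)
truth = rng.uniform(0, 10, 5000)
reco = truth + rng.normal(0, 1, 5000)
xbins = numpy.linspace(0, 10, 21)

centers, median, lower, upper = find_contours_2D(truth, reco, xbins)
plt.plot(centers, median, 'k-')
plt.fill_between(centers, lower, upper, alpha=0.3)
plt.show()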
def __init__(self, lines=None, skew=None):
    # If no skew is given, estimate it as the length-weighted median
    # of the line angles.
    if skew is None:
        assert lines is not None
        angles = np.array([line.angle for line in lines.values()])
        lengths = np.array([line.length for line in lines.values()])
        skew = wquantiles.median(angles, lengths)
    self._skew = skew
    # cv2.getRotationMatrix2D expects degrees, so convert the skew from radians
    self._matrix = cv2.getRotationMatrix2D((0, 0), skew * (180 / math.pi), 1)
    self._shapely_matrix = to_shapely_matrix(self._matrix)
def func_median(data, mask, map_clusters=None):
    """
    Compute weighted median. This is a "non-discrete" implementation of the median,
    in that it computes the mean between the middle discrete values. For more context,
    see: https://github.com/nudomarinero/wquantiles/issues/4
    :param data: nd-array: input data
    :param mask: (n+1)d-array: input mask
    :param map_clusters: not used
    :return:
    """
    # Check if mask has an additional dimension (in case it is a label). If so, select the first label
    if mask.ndim == data.ndim + 1:
        mask = mask[..., 0]
    data, mask = data.reshape(-1), mask.reshape(-1)
    return wquantiles.median(data, mask), None
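# A quick standalone check of the "non-discrete" behavior described above
# (toy data, assumed to live in the same module as func_median): with uniform
# weights over an even number of points, wquantiles interpolates between the
# two middle values rather than picking one of them.
import numpy as np
import wquantiles

data = np.array([1.0, 2.0, 3.0, 4.0])
mask = np.ones_like(data)
print(func_median(data, mask))  # roughly (2.5, None)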
from scipy.stats import trim_mean
import pandas as pd
import numpy as np
import wquantiles
# If `import wquantiles` fails, install the package with `pip install wquantiles`
# and then import it here. Detailed info: https://pypi.org/project/wquantiles/#description

state = pd.read_csv("state.csv")
print(state.head(8))

print("Mean: ", state["Population"].mean())
print("Trimmed Mean: ", trim_mean(state["Population"], 0.1))
print("Median: ", state["Population"].median())
# The trimmed mean is close to the median because we cut the bottom and top 10% of the data

# numpy for weighted mean
print("Murder Rate Mean: ", state["Murder.Rate"].mean())
print("Weighted mean: ", np.average(state["Murder.Rate"], weights=state["Population"]))
print("Weighted median: ", wquantiles.median(state['Murder.Rate'], weights=state['Population']))
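# A minimal sketch of what the weighted median does, on fabricated toy data:
# sort the values, accumulate their weights, and take the value where the
# cumulative weight crosses half the total. (wquantiles itself interpolates
# at that crossing instead of snapping to a discrete value.)
import numpy as np

values = np.array([1.0, 3.0, 5.0])
weights = np.array([1.0, 1.0, 10.0])
order = np.argsort(values)
cum = np.cumsum(weights[order])
# first value whose cumulative weight reaches half the total weight
print(values[order][np.searchsorted(cum, cum[-1] / 2.0)])  # 5.0, since its weight dominates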
    except:
        pass
    os.remove(mtnm)

# now we construct the weighted median flux
mdfx = []
weights = np.array(weights)
# taking the lambda from the reference OB, since all of them are the same
reflbd = star_db[obj][star][hiob]['rblbd']
# this loop uses the wq package to take the weighted median for each flux point
for n, f in enumerate(reflbd):
    fxs = []
    for ob in star_db[obj][star].keys():
        fxs.append(star_db[obj][star][ob]['rbflx'][n])
    fxs = np.array(fxs)
    mdfx.append(wq.median(fxs, weights))

# an issue with Etoile makes it add some zeros to the end of the files
# we correct this here
rejl = []
for n, f in enumerate(mdfx):
    if f == 0:
        rejl.append(n)

# now we save the combined spectrum
comblb = np.delete(reflbd, rejl)
combfx = np.delete(mdfx, rejl)
star_db[obj][star]['combspec'] = [comblb, combfx]
# plt.plot(comblb, combfx, 'k-', lw=0.7, label='combined')
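# A self-contained sketch of the per-pixel weighted-median combination above,
# with fabricated fluxes: rows are observations (OBs), columns are wavelength
# points, and each observation carries a single weight.
import numpy as np
import wquantiles as wq

fluxes = np.array([[1.0, 2.0, 0.0],
                   [1.2, 1.8, 0.0],
                   [5.0, 2.1, 0.0]])  # last column mimics Etoile's trailing zeros
obs_weights = np.array([1.0, 1.0, 0.2])
combined = np.array([wq.median(fluxes[:, n], obs_weights)
                     for n in range(fluxes.shape[1])])
print(combined[combined != 0])  # drop the zero-padded points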
def stat(x, key):
    return wq.median(x["time"], x[key])
def wmedian2(df, column_name, weights_name='wt0'):
    df = df.dropna(subset=[column_name, weights_name])
    return wq.median(df[column_name], df[weights_name])
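# A usage sketch for wmedian2 on a fabricated DataFrame with a value column
# and a weight column named 'wt0', matching the default above; assumes
# `import wquantiles as wq` in the module where wmedian2 lives.
import pandas as pd

df = pd.DataFrame({"price": [10.0, 20.0, None, 40.0],
                   "wt0": [1.0, 2.0, 1.0, 1.0]})
print(wmedian2(df, "price"))  # the NaN row is dropped first; roughly 20.0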
def game_predictions(df, home_team, away_team, last_n_games='all',
                     outer_opp_win_pct=True, central_tendency='mean',
                     distribution='poisson', inner_opp_win_pct=True,
                     weight_home=1, weight_away=1, n_simulations=1000):
    # suppress the SettingWithCopyWarning
    pd.options.mode.chained_assignment = None
    # drop unplayed games
    df = df.dropna(subset=['home_score'])
    # get win pct for each team for weighting later
    # get all teams
    list_all_teams = list(df['home_team']) + list(df['away_team'])
    # get the unique ones
    list_teams_unique = list(dict.fromkeys(list_all_teams))
    # get win pct for each team
    list_win_pct = []
    for team in list_teams_unique:
        # subset to where home team or away team == team
        df_subset = df[(df['home_team'] == team) | (df['away_team'] == team)]
        # see how many times team is in winning_team
        n_wins = list(df_subset['winning_team']).count(team)
        # get number of games
        n_games = df_subset.shape[0]
        # get win pct
        win_pct = n_wins / n_games
        # if we have zero win pct make it .01
        if win_pct == 0:
            win_pct = 0.01
        # append to list
        list_win_pct.append(win_pct)
    # match win_pct with team
    df_win_pct = pd.DataFrame({
        'team': list_teams_unique,
        'win_pct': list_win_pct
    })

    # -------------------------------------------------------------------------
    # 1. get all the games where the home_team was playing
    df_home = df[(df['home_team'] == home_team) | (df['away_team'] == home_team)]
    # to prevent errors, cap last_n_games at the number of rows
    n_rows = df_home.shape[0]
    if (last_n_games != 'all') and (last_n_games > n_rows):
        last_n_games = n_rows
    # use last n_games
    if last_n_games != 'all':
        df_home = df_home.iloc[-last_n_games:]
    # rename the columns because it helps some of the logic later
    df_home.rename(columns={
        'home_score': 'home_pts',
        'away_score': 'away_pts'
    }, inplace=True)
    # get points scored by home team (name the col home_score so it will match with the other logic we have)
    df_home['home_score'] = df_home.apply(
        lambda x: x['home_pts'] if x['home_team'] == home_team else x['away_pts'],
        axis=1)
    # get the points allowed by the home team
    df_home['away_score'] = df_home.apply(
        lambda x: x['home_pts'] if x['home_team'] != home_team else x['away_pts'],
        axis=1)
    # mark games where the home_team is home with a number (i.e., weight_home)
    df_home['weights'] = df_home.apply(
        lambda x: weight_home if x['home_team'] == home_team else 1, axis=1)
    # if we choose to weight each game by the opponent's win %
    if outer_opp_win_pct == True:
        # get the name of the opponent
        df_home['opponent_name'] = df_home.apply(
            lambda x: x['home_team'] if x['away_team'] == home_team else x['away_team'],
            axis=1)
        # merge with df_win_pct to get opponent win %
        df_home = pd.merge(left=df_home, right=df_win_pct,
                           left_on='opponent_name', right_on='team', how='left')
        # multiply 'weights' by 'win_pct'
        df_home['weights'] = df_home['weights'] * df_home['win_pct']
    # save weights
    list_weights = list(df_home['weights'])
    # some logic to catch errors
    if np.sum(list_weights) == 0:
        list_weights = [1 for x in list_weights]
    # get the central tendency number
    if central_tendency == 'mean':
        # calculate weighted mean of home_score
        home_home_score_avg = np.average(df_home['home_score'], weights=list_weights)
        # get weighted mean of away_score
        home_opponent_score_avg = np.average(df_home['away_score'], weights=list_weights)
    else:
        # calculate weighted median of home_score
        home_home_score_avg = weighted.median(df_home['home_score'], weights=list_weights)
        # get weighted median of away_score
        home_opponent_score_avg = weighted.median(df_home['away_score'], weights=list_weights)
    # if distribution == 'poisson'
    if distribution == 'poisson':
        # draw random numbers from a poisson distribution for predicted home score
        list_pred_home_home_score = list(
            np.random.poisson(home_home_score_avg, n_simulations))
        # draw random numbers from a poisson distribution for predicted away score
        list_pred_home_opponent_score = list(
            np.random.poisson(home_opponent_score_avg, n_simulations))
    # if distribution == 'normal'
    else:
        # calculate weighted sd of home_score (for the normal distribution)
        home_home_score_sd = np.sqrt(
            np.average((df_home['home_score'] - home_home_score_avg)**2,
                       weights=list_weights))
        # get weighted sd of away_score
        home_opponent_score_sd = np.sqrt(
            np.average((df_home['away_score'] - home_opponent_score_avg)**2,
                       weights=list_weights))
        # draw random numbers from a normal distribution
        list_pred_home_home_score = list(
            np.random.normal(loc=home_home_score_avg,
                             scale=home_home_score_sd,
                             size=n_simulations))
        list_pred_home_opponent_score = list(
            np.random.normal(loc=home_opponent_score_avg,
                             scale=home_opponent_score_sd,
                             size=n_simulations))

    # -------------------------------------------------------------------------
    # 2. repeat the same steps but using the away team
    # get all the games where the away_team was playing
    df_away = df[(df['home_team'] == away_team) | (df['away_team'] == away_team)]
    # to prevent errors, cap last_n_games at the number of rows
    n_rows = df_away.shape[0]
    if (last_n_games != 'all') and (last_n_games > n_rows):
        last_n_games = n_rows
    # use last n_games
    if last_n_games != 'all':
        df_away = df_away.iloc[-last_n_games:]
    # rename the columns because it helps some of the logic later
    df_away.rename(columns={
        'home_score': 'home_pts',
        'away_score': 'away_pts'
    }, inplace=True)
    # get points scored by away team (name the col away_score so it will match with the other logic we have)
    df_away['away_score'] = df_away.apply(
        lambda x: x['away_pts'] if x['away_team'] == away_team else x['home_pts'],
        axis=1)
    # get the points allowed by the away team
    df_away['home_score'] = df_away.apply(
        lambda x: x['away_pts'] if x['away_team'] != away_team else x['home_pts'],
        axis=1)
    # mark games where the away_team is away with a number (i.e., weight_away)
    df_away['weights'] = df_away.apply(
        lambda x: weight_away if x['away_team'] == away_team else 1, axis=1)
    # if we choose to weight each game by the opponent's win %
    if outer_opp_win_pct == True:
        # get the name of the opponent
        df_away['opponent_name'] = df_away.apply(
            lambda x: x['away_team'] if x['home_team'] == away_team else x['home_team'],
            axis=1)
        # merge with df_win_pct to get opponent win %
        df_away = pd.merge(left=df_away, right=df_win_pct,
                           left_on='opponent_name', right_on='team', how='left')
        # multiply 'weights' by 'win_pct'
        df_away['weights'] = df_away['weights'] * df_away['win_pct']
    # save weights
    list_weights = list(df_away['weights'])
    # some logic to catch errors
    if np.sum(list_weights) == 0:
        list_weights = [1 for x in list_weights]
    # get the central tendency number
    if central_tendency == 'mean':
        # calculate weighted mean of away_score
        away_away_score_avg = np.average(df_away['away_score'], weights=list_weights)
        # get weighted mean of points allowed
        away_opponent_score_avg = np.average(df_away['home_score'], weights=list_weights)
    else:  # i.e., median
        # calculate weighted median of away_score
        away_away_score_avg = weighted.median(df_away['away_score'], weights=list_weights)
        # get weighted median of points allowed
        away_opponent_score_avg = weighted.median(df_away['home_score'], weights=list_weights)
    # if distribution == 'poisson'
    if distribution == 'poisson':
        # draw random numbers from a poisson distribution for predicted away score
        list_pred_away_away_score = list(
            np.random.poisson(away_away_score_avg, n_simulations))
        # draw random numbers from a poisson distribution for predicted points allowed
        list_pred_away_opponent_score = list(
            np.random.poisson(away_opponent_score_avg, n_simulations))
    # if distribution == 'normal'
    else:
        # calculate weighted sd of away_score (for the normal distribution)
        away_away_score_sd = np.sqrt(
            np.average((df_away['away_score'] - away_away_score_avg)**2,
                       weights=list_weights))
        # get weighted sd of points allowed
        away_opponent_score_sd = np.sqrt(
            np.average((df_away['home_score'] - away_opponent_score_avg)**2,
                       weights=list_weights))
        # draw random numbers from a normal distribution
        list_pred_away_away_score = list(
            np.random.normal(loc=away_away_score_avg,
                             scale=away_away_score_sd,
                             size=n_simulations))
        list_pred_away_opponent_score = list(
            np.random.normal(loc=away_opponent_score_avg,
                             scale=away_opponent_score_sd,
                             size=n_simulations))

    # -------------------------------------------------------------------------
    # put into a df
    df_predictions = pd.DataFrame({
        'pred_home_home_score': list_pred_home_home_score,
        'pred_home_opponent_score': list_pred_home_opponent_score,
        'pred_away_away_score': list_pred_away_away_score,
        'pred_away_opponent_score': list_pred_away_opponent_score
    })

    # -------------------------------------------------------------------------
    # 3. now let's have the scores meet in the middle
    # if we want a straight avg
    if inner_opp_win_pct == False:
        list_weights = [1, 1]
    # if we want to weight in terms of win pct
    else:
        # get list of teams
        list_matchup_teams = [home_team, away_team]
        # get win pct for each team in the matchup so we can use them as weights
        list_weights = []
        for opp in list_matchup_teams:
            # find index of opp in df_win_pct
            index_opp = list(df_win_pct['team']).index(opp)
            # get win pct
            win_pct = df_win_pct['win_pct'][index_opp]
            # append to list
            list_weights.append(win_pct)
        # logic to avoid errors
        if np.sum(list_weights) == 0:
            list_weights = [1, 1]
    # home score prediction
    df_predictions['pred_home_score'] = df_predictions.apply(
        lambda x: np.average(
            [x['pred_home_home_score'], x['pred_away_opponent_score']],
            weights=list_weights),
        axis=1)
    # away score prediction
    df_predictions['pred_away_score'] = df_predictions.apply(
        lambda x: np.average(
            [x['pred_home_opponent_score'], x['pred_away_away_score']],
            weights=list_weights),
        axis=1)
    # create a col of 1/0 to deal with ties
    df_predictions['rand_binomial'] = np.random.binomial(1, 0.5, n_simulations)

    # create col == 1 if home team wins
    # define function
    def did_home_win(pred_home_score, pred_away_score, rand_binomial):
        if pred_home_score > pred_away_score:
            return 1
        elif pred_home_score < pred_away_score:
            return 0
        elif pred_home_score == pred_away_score and rand_binomial == 1:
            return 1
        else:
            return 0

    # get sum of games where home team won
    sum_home_wins = np.sum(
        df_predictions.apply(
            lambda x: did_home_win(pred_home_score=x['pred_home_score'],
                                   pred_away_score=x['pred_away_score'],
                                   rand_binomial=x['rand_binomial']),
            axis=1))
    # get the proportion of games where the home team is > away team
    prop_home_win = sum_home_wins / n_simulations
    # get mean home score
    mean_home_score = round(np.mean(df_predictions['pred_home_score']), 3)
    # get mean away score
    mean_away_score = round(np.mean(df_predictions['pred_away_score']), 3)
    # get winning team
    if prop_home_win >= .5:
        winning_team = home_team
    else:
        winning_team = away_team
    # create a dictionary to return objects
    dict_results = {
        'mean_home_pts': mean_home_score,
        'mean_away_pts': mean_away_score,
        'prob_home_win': prop_home_win,
        'winning_team': winning_team
    }
    # return dict_results
    return dict_results
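# A hedged usage sketch with a tiny fabricated schedule; the column names
# (home_team, away_team, home_score, away_score, winning_team) follow what
# the function reads above, and None marks a drawn game in this toy data.
# Assumes pd/np are imported in the function's module.
import numpy as np
import pandas as pd

df_games = pd.DataFrame({
    'home_team': ['A', 'B', 'A', 'C'],
    'away_team': ['B', 'C', 'C', 'A'],
    'home_score': [3, 1, 2, 0],
    'away_score': [1, 1, 2, 2],
    'winning_team': ['A', None, None, 'A'],
})
print(game_predictions(df_games, home_team='A', away_team='B',
                       n_simulations=500))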
def wmedian3(df, column_name, weights_name='wt0'):
    import wquantiles as wq
    df = df.dropna(subset=[column_name, weights_name])
    return wq.median(df[column_name], df[weights_name])
for y in range(nTopos):
    distMat = make2DarrayFrom1DupperTriangle(dists[x, y, :], nTaxa)
    for z in range(len(nodeNames[y])):
        depths[x, y, z] = distMat[nodeLeafPairs[y][z]].mean() if not nodes_all[y][z].is_leaf() else 0.0

# scale depths by dividing by the depth of the root
# the first node in each topo is the root, as the traversal goes to the root first
depths = depths / np.repeat(depths[:, :, 0, np.newaxis], depths.shape[2], axis=2)

# anywhere we have nan is where the root depth was zero. This happens where we had
# missing data. So we can set all these tree depths to zero.
depths = np.nan_to_num(depths)

depths_average = np.average(depths, axis=0,
                            weights=np.repeat(weights[:, :, np.newaxis], depths.shape[2], axis=2))

depths_median = [[wquantiles.median(depths[:, j, k], weights=weights[:, j])
                  for k in range(depths.shape[2])]
                 for j in range(depths.shape[1])]

if args.quantiles:
    depths_qL = [[wquantiles.quantile(depths[:, j, k], weights[:, j], args.quantiles[0])
                  for k in range(depths.shape[2])]
                 for j in range(depths.shape[1])]
    depths_qU = [[wquantiles.quantile(depths[:, j, k], weights[:, j], args.quantiles[1])
                  for k in range(depths.shape[2])]
                 for j in range(depths.shape[1])]

#cols = np.array([
#"#2BCE48", #Green
#"#005C31", #Forest
#"#94FFB5", #Jade
#"#9DCC00", #Lime
#"#426600", #Quagmire
#"#00998F", #Turquoise
#"#5EF1F2", #Sky
#"#0075DC", #Blue
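# A small standalone sketch of the pattern above, on fabricated shapes: take
# a weighted median over the replicate axis (axis 0) for every
# (topology, node) cell, with one weight per replicate and topology.
import numpy as np
import wquantiles

reps, topos, nodes = 50, 2, 3
vals = np.random.rand(reps, topos, nodes)
w = np.random.rand(reps, topos)
med = [[wquantiles.median(vals[:, j, k], weights=w[:, j])
        for k in range(nodes)]
       for j in range(topos)]
print(np.array(med).shape)  # (2, 3)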
def plot_1d_binned_slices(truth, reco1, reco2=None,
                          xarray1=None, xarray2=None, truth2=None,
                          plot_resolution=False, use_fraction=False,
                          bins=10, xmin=-1., xmax=1., style="contours",
                          x_name="Zenith", x_units="",
                          y_units=None, reco1_name="Reco 1", reco2_name="Reco 2",
                          reco1_weight=None, reco2_weight=None,
                          save=True, savefolder='.'):
    """Plots different energy slices vs each other (systematic set arrays)
    Receives:
        truth = 1D array with truth values
        reco1 = 1D array that has reconstructed results
        reco2 = optional, 1D array that has an alternate reconstructed results
        xarray1 = optional, 1D array that the reco1 variable (or resolution) will be plotted against, if none is given, will automatically use truth
        xarray2 = optional, 1D array that the reco2 variable (or resolution2) will be plotted against, if none is given, will automatically use xarray1
        truth2 = 1D array with truth values used to calculate resolution2
        plot_resolution = use resolution (reco - truth) instead of just reconstructed values
        use_fraction = bool, use fractional resolution instead of absolute, where (reco - truth)/truth
        style = "errorbars" is the only string that triggers the errorbar version, default is the contour plot version
        bins = integer number of data points you want (range/bins = width)
        xmin = minimum truth value to start cut at (default = -1.)
        xmax = maximum truth value to end cut at (default = 1.)
        x_name = variable for x axis (what is the truth)
        x_units = units for truth/x-axis variable
        reco1_name = name for reconstruction 1
        reco2_name = name for reconstruction 2
        reco1_weight = 1D array for reco1 weights, if left None, will not use
        reco2_weight = 1D array for reco2 weights, if left None, will not use
    Returns:
        Scatter plot with truth bins on x axis (median of bin width)
        y axis has median of resolution or absolute reconstructed value
        with error bars containing given percentile
    """
    percentile_in_peak = 68.27  # CAN CHANGE
    left_tail_percentile = (100. - percentile_in_peak) / 2
    right_tail_percentile = 100. - left_tail_percentile
    ranges = numpy.linspace(xmin, xmax, num=bins)
    centers = (ranges[1:] + ranges[:-1]) / 2.

    # if no xarray given, automatically use truth
    if xarray1 is None:
        xarray1 = truth
    # Calculate resolution if plot_resolution flag == True
    if plot_resolution:
        if use_fraction:
            yvariable = ((reco1 - truth) / truth)  # in fraction
        else:
            yvariable = (reco1 - truth)
    else:  # use reco directly, not resolution
        yvariable = reco1
        assert use_fraction == False, "Flag for fractional resolution only, not doing resolution here"
    medians = numpy.zeros(len(centers))
    err_from = numpy.zeros(len(centers))
    err_to = numpy.zeros(len(centers))

    # Compare to second reconstruction if given
    if reco2 is not None:
        # check if some variables exist, if not, set to match reco1's
        if truth2 is None:
            truth2 = truth
        if xarray2 is None:
            xarray2 = xarray1
        if plot_resolution:
            if use_fraction:
                yvariable2 = ((reco2 - truth2) / truth2)
            else:
                yvariable2 = (reco2 - truth2)
        else:
            yvariable2 = reco2
        medians2 = numpy.zeros(len(centers))
        err_from2 = numpy.zeros(len(centers))
        err_to2 = numpy.zeros(len(centers))

    # Find median and percentile bounds for data
    for i in range(len(ranges) - 1):
        # Make a cut based on the truth (binned on truth)
        var_to = ranges[i + 1]
        var_from = ranges[i]
        cut = (xarray1 >= var_from) & (xarray1 < var_to)
        assert sum(cut) > 0, "No events in xbin from %s to %s for reco1, may need to change xmin, xmax, or number of bins or check truth/xarray inputs" % (var_from, var_to)
        if reco2 is not None:
            cut2 = (xarray2 >= var_from) & (xarray2 < var_to)
            assert sum(cut2) > 0, "No events in xbin from %s to %s for reco2, may need to change xmin, xmax, or number of bins or check truth2/xarray2 inputs" % (var_from, var_to)
        # find the median and percentile bounds of reco1 (or resolution) in this bin
        if reco1_weight is None:
            lower_lim = numpy.percentile(yvariable[cut], left_tail_percentile)
            upper_lim = numpy.percentile(yvariable[cut], right_tail_percentile)
            median = numpy.percentile(yvariable[cut], 50.)
        else:
            import wquantiles as wq
            # wq.quantile expects a fraction in [0, 1], so convert from percent
            lower_lim = wq.quantile(yvariable[cut], reco1_weight[cut], left_tail_percentile / 100.)
            upper_lim = wq.quantile(yvariable[cut], reco1_weight[cut], right_tail_percentile / 100.)
            median = wq.median(yvariable[cut], reco1_weight[cut])
        medians[i] = median
        err_from[i] = lower_lim
        err_to[i] = upper_lim
        # find the median and percentile bounds of reco2 (or resolution2) in this bin
        if reco2 is not None:
            if reco2_weight is None:
                lower_lim2 = numpy.percentile(yvariable2[cut2], left_tail_percentile)
                upper_lim2 = numpy.percentile(yvariable2[cut2], right_tail_percentile)
                median2 = numpy.percentile(yvariable2[cut2], 50.)
            else:
                import wquantiles as wq
                # wq.quantile expects a fraction in [0, 1], so convert from percent
                lower_lim2 = wq.quantile(yvariable2[cut2], reco2_weight[cut2], left_tail_percentile / 100.)
                upper_lim2 = wq.quantile(yvariable2[cut2], reco2_weight[cut2], right_tail_percentile / 100.)
                median2 = wq.median(yvariable2[cut2], reco2_weight[cut2])
            medians2[i] = median2
            err_from2[i] = lower_lim2
            err_to2[i] = upper_lim2

    # Make plot
    plt.figure(figsize=(10, 7))
    # Median as datapoint
    # Percentile as y error bars
    # Bin size as x error bars
    if style == "errorbars":
        plt.errorbar(centers, medians,
                     yerr=[medians - err_from, err_to - medians],
                     xerr=[centers - ranges[:-1], ranges[1:] - centers],
                     capsize=5.0, fmt='o', label="%s" % reco1_name)
        # Compare to second reconstruction, if given
        if reco2 is not None:
            plt.errorbar(centers, medians2,
                         yerr=[medians2 - err_from2, err_to2 - medians2],
                         xerr=[centers - ranges[:-1], ranges[1:] - centers],
                         capsize=5.0, fmt='o', label="%s" % reco2_name)
        plt.legend(loc="upper center")
    # Make contour plot
    # Center solid line is median
    # Shaded region is percentile
    # NOTE: plotted using centers, so 0th and last bins look like they stop short (by 1/2*bin_size)
    else:
        alpha = 0.5
        lwid = 3
        cmap = plt.get_cmap('Blues')
        colors = cmap(numpy.linspace(0, 1, 2 + 2))[2:]
        color = colors[0]
        cmap = plt.get_cmap('Oranges')
        rcolors = cmap(numpy.linspace(0, 1, 2 + 2))[2:]
        rcolor = rcolors[0]
        ax = plt.gca()
        ax.plot(centers, medians, linestyle='-',
                label="%s median" % (reco1_name), color=color, linewidth=lwid)
        ax.fill_between(centers, medians, err_from, color=color, alpha=alpha)
        ax.fill_between(centers, medians, err_to, color=color, alpha=alpha,
                        label=reco1_name + " %i" % percentile_in_peak + '%')
        if reco2 is not None:
            ax.plot(centers, medians2, color=rcolor, linestyle='-',
                    label="%s median" % reco2_name, linewidth=lwid)
            ax.fill_between(centers, medians2, err_from2, color=rcolor, alpha=alpha)
            ax.fill_between(centers, medians2, err_to2, color=rcolor, alpha=alpha,
                            label=reco2_name + " %i" % percentile_in_peak + '%')

    # Extra features to have a horizontal 0 line and trim the x axis
    plt.plot([xmin, xmax], [0, 0], color='k')
    plt.xlim(xmin, xmax)

    # Make pretty labels
    plt.xlabel("%s %s" % (x_name, x_units))
    if plot_resolution:
        if use_fraction:
            plt.ylabel("Fractional Resolution: \n (reconstruction - truth)/truth")
        else:
            plt.ylabel("Resolution: \n reconstruction - truth %s" % x_units)
            if y_units is not None:
                plt.ylabel("Resolution: \n reconstruction - truth %s" % y_units)
    else:
        plt.ylabel("Reconstructed %s %s" % (x_name, x_units))

    # Make a pretty title
    title = "%s Dependence for %s" % (x_name, reco1_name)
    if reco2 is not None:
        title += " and %s" % reco2_name
    if plot_resolution:
        title += " Resolution"
    plt.title("%s" % (title))

    # Make a pretty filename
    savename = "%s" % (x_name.replace(" ", ""))
    if use_fraction:
        savename += "Frac"
    if plot_resolution:
        savename += "Resolution"
    if reco2 is not None:
        savename += "_Compare%s" % (reco2_name.replace(" ", ""))
    if save == True:
        plt.savefig("%s/%s.png" % (savefolder, savename))
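# A hedged usage sketch with fabricated toy arrays: compare the resolution of
# two reconstructions binned in the truth variable. Assumes the function's
# module has `import numpy` and `import matplotlib.pyplot as plt` at the top,
# as the body above expects.
import numpy
import matplotlib.pyplot as plt

rng = numpy.random.default_rng(1)
true_zenith = rng.uniform(-1, 1, 2000)
reco_a = true_zenith + rng.normal(0, 0.1, 2000)
reco_b = true_zenith + rng.normal(0, 0.2, 2000)
plot_1d_binned_slices(true_zenith, reco_a, reco2=reco_b,
                      plot_resolution=True, bins=10,
                      xmin=-1., xmax=1., x_name="Cosine Zenith",
                      save=False)
plt.show()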
state = pd.read_csv(STATE_CSV)
print(state["Population"].mean())
print(trim_mean(state["Population"], 0.1))
print(state["Population"].median())

# Weighted mean is available with numpy. For weighted median, we can use the
# specialised package `wquantiles` (https://pypi.org/project/wquantiles/).
print(state["Murder.Rate"].mean())
print(np.average(state["Murder.Rate"], weights=state["Population"]))
print(wquantiles.median(state["Murder.Rate"], weights=state["Population"]))

# Estimates of Variability
# Table 1-2
print(state.head(8))

# Standard deviation
print(state["Population"].std())

# Interquartile range is calculated as the difference of the 75% and 25% quantile.
print(state["Population"].quantile(0.75) - state["Population"].quantile(0.25))

# Median absolute deviation from the median can be calculated with a method in _statsmodels_
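# A sketch of that MAD calculation, reusing the `state` DataFrame from above
# and assuming statsmodels is installed; note that statsmodels' mad() rescales
# by ~1/0.6745 by default, so the result is comparable to the standard
# deviation for normally distributed data.
from statsmodels import robust
print(robust.scale.mad(state["Population"]))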
###
### Practical Statistics for Data Scientists
###
import pandas as pd
from scipy import stats
import numpy as np
import wquantiles

state = pd.read_csv(
    'D:/my-coding/practical-statistics-4-ds-book/data/state.csv')

state_mean = state['Population'].mean()
print(state_mean)

state_mean_trimmed_01 = stats.trim_mean(state['Population'], 0.1)
print(state_mean_trimmed_01)

state_median = state['Population'].median()
print(state_median)

state_weighted_mean = np.average(state['Murder.Rate'], weights=state['Population'])
print(state_weighted_mean)

state_weighted_median = wquantiles.median(state['Murder.Rate'], weights=state['Population'])
print(state_weighted_median)

## key ideas
## the basic metric for location is the mean, but it can be sensitive to extreme values (outliers)
## other metrics such as the median and trimmed mean are less sensitive to outliers and unusual
## distributions and hence are more robust