def find_contours_2D(x_values, y_values, xbins, weights=None, c1=16, c2=84):
    """
    Find upper and lower contours and median
    x_values = array, input for hist2d for x axis (typically truth)
    y_values = array, input for hist2d for y axis (typically reconstruction)
    xbins = values for the starting edge of the x bins (output from hist2d)
    c1 = percentage for lower contour bound (16% - 84% means a 68% band, so c1 = 16)
    c2 = percentage for upper contour bound (16% - 84% means a 68% band, so c2 = 84)
    Returns:
        x = values for xbins, repeated for plotting (i.e. [0,0,1,1,2,2,...])
        y_median = values for y value medians per bin, repeated for plotting (i.e. [40,40,20,20,50,50,...])
        y_lower = values for y value lower limits per bin, repeated for plotting (i.e. [30,30,10,10,20,20,...])
        y_upper = values for y value upper limits per bin, repeated for plotting (i.e. [50,50,40,40,60,60,...])
        (note: as currently written, the function returns bin centers plus the
        median/lower/upper arrays instead; see the comment before the return)
    """
    if weights is not None:
        import wquantiles as wq
    y_values = numpy.array(y_values)
    indices = numpy.digitize(x_values, xbins)
    r1_save = []
    r2_save = []
    median_save = []
    for i in range(1, len(xbins)):
        mask = indices == i
        if len(y_values[mask]) > 0:
            if weights is None:
                r1, m, r2 = numpy.percentile(y_values[mask], [c1, 50, c2])
            else:
                # wq.quantile expects a fraction in [0, 1], so convert from percent
                r1 = wq.quantile(y_values[mask], weights[mask], c1 / 100.)
                r2 = wq.quantile(y_values[mask], weights[mask], c2 / 100.)
                m = wq.median(y_values[mask], weights[mask])
        else:
            # empty bin
            r1 = numpy.nan
            m = numpy.nan
            r2 = numpy.nan
        median_save.append(m)
        r1_save.append(r1)
        r2_save.append(r2)
    median = numpy.array(median_save)
    lower = numpy.array(r1_save)
    upper = numpy.array(r2_save)
    # The original code (borrowed from the oscnext folks) returned bin edges and
    # values repeated pairwise for step-style plotting; it is removed for now:
    # x = list(itertools.chain(*zip(xbins[:-1], xbins[1:])))
    # y_median = list(itertools.chain(*zip(median, median)))
    # y_lower = list(itertools.chain(*zip(lower, lower)))
    # y_upper = list(itertools.chain(*zip(upper, upper)))
    # return x, y_median, y_lower, y_upper
    # Averaging adjacent bin edges ([1:] and [:-1]) locates the bin centers.
    return (xbins[1:] + xbins[:-1]) / 2, median, lower, upper
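# A minimal usage sketch, assuming find_contours_2D and `import numpy` are in
# scope; the `truth` and `reco` arrays here are fabricated for illustration.
# Bin reconstruction vs. truth and draw the median with a 68% band.
import numpy
import matplotlib.pyplot as plt

rng = numpy.random.default_rng(0)
truth = rng.uniform(0, 10, 5000)
reco = truth + rng.normal(0, 1, 5000)
xbins = numpy.linspace(0, 10, 21)

centers, median, lower, upper = find_contours_2D(truth, reco, xbins)
plt.plot(centers, median, 'k-')
plt.fill_between(centers, lower, upper, alpha=0.3)
plt.show()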
def __init__(self, lines=None, skew=None):
    # If no skew is given, estimate it as the length-weighted median
    # of the line angles.
    if skew is None:
        assert lines is not None
        angles = np.array([line.angle for line in lines.values()])
        lengths = np.array([line.length for line in lines.values()])
        skew = wquantiles.median(angles, lengths)
    self._skew = skew
    # cv2.getRotationMatrix2D expects degrees, so convert the skew from radians
    self._matrix = cv2.getRotationMatrix2D((0, 0), skew * (180 / math.pi), 1)
    self._shapely_matrix = to_shapely_matrix(self._matrix)
def func_median(data, mask, map_clusters=None):
    """
    Compute weighted median. This is a "non-discrete" implementation of the median,
    in that it computes the mean between the middle discrete values. For more context,
    see: https://github.com/nudomarinero/wquantiles/issues/4
    :param data: nd-array: input data
    :param mask: (n+1)d-array: input mask
    :param map_clusters: not used
    :return:
    """
    # Check if mask has an additional dimension (in case it is a label). If so, select the first label
    if mask.ndim == data.ndim + 1:
        mask = mask[..., 0]
    data, mask = data.reshape(-1), mask.reshape(-1)
    return wquantiles.median(data, mask), None
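# A quick standalone check of the "non-discrete" behavior described above
# (toy data, assumed to live in the same module as func_median): with uniform
# weights over an even number of points, wquantiles interpolates between the
# two middle values rather than picking one of them.
import numpy as np
import wquantiles

data = np.array([1.0, 2.0, 3.0, 4.0])
mask = np.ones_like(data)
print(func_median(data, mask))  # roughly (2.5, None)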
from scipy.stats import trim_mean
import pandas as pd
import numpy as np
import wquantiles
# If `import wquantiles` fails, install the package with `pip install wquantiles`
# and then import it here. Detailed info: https://pypi.org/project/wquantiles/#description

state = pd.read_csv("state.csv")
print(state.head(8))

print("Mean: ", state["Population"].mean())
print("Trimmed Mean: ", trim_mean(state["Population"], 0.1))
print("Median: ", state["Population"].median())
# The trimmed mean is close to the median because we cut the bottom and top 10% of the data

# numpy for weighted mean
print("Murder Rate Mean: ", state["Murder.Rate"].mean())
print("Weighted mean: ", np.average(state["Murder.Rate"], weights=state["Population"]))
print("Weighted median: ", wquantiles.median(state['Murder.Rate'], weights=state['Population']))
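# A minimal sketch of what the weighted median does, on fabricated toy data:
# sort the values, accumulate their weights, and take the value where the
# cumulative weight crosses half the total. (wquantiles itself interpolates
# at that crossing instead of snapping to a discrete value.)
import numpy as np

values = np.array([1.0, 3.0, 5.0])
weights = np.array([1.0, 1.0, 10.0])
order = np.argsort(values)
cum = np.cumsum(weights[order])
# first value whose cumulative weight reaches half the total weight
print(values[order][np.searchsorted(cum, cum[-1] / 2.0)])  # 5.0, since its weight dominates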
    except:
        pass
    os.remove(mtnm)

# now we construct the weighted median flux
mdfx = []
weights = np.array(weights)
# taking the lambda from the reference OB, since all of them are the same
reflbd = star_db[obj][star][hiob]['rblbd']
# this loop uses the wq package to take the weighted median for each flux point
for n, f in enumerate(reflbd):
    fxs = []
    for ob in star_db[obj][star].keys():
        fxs.append(star_db[obj][star][ob]['rbflx'][n])
    fxs = np.array(fxs)
    mdfx.append(wq.median(fxs, weights))

# an issue with Etoile makes it add some zeros to the end of the files
# we correct this here
rejl = []
for n, f in enumerate(mdfx):
    if f == 0:
        rejl.append(n)

# now we save the combined spectrum
comblb = np.delete(reflbd, rejl)
combfx = np.delete(mdfx, rejl)
star_db[obj][star]['combspec'] = [comblb, combfx]
# plt.plot(comblb, combfx, 'k-', lw=0.7, label='combined')
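# A self-contained sketch of the per-pixel weighted-median combination above,
# with fabricated fluxes: rows are observations (OBs), columns are wavelength
# points, and each observation carries a single weight.
import numpy as np
import wquantiles as wq

fluxes = np.array([[1.0, 2.0, 0.0],
                   [1.2, 1.8, 0.0],
                   [5.0, 2.1, 0.0]])  # last column mimics Etoile's trailing zeros
obs_weights = np.array([1.0, 1.0, 0.2])
combined = np.array([wq.median(fluxes[:, n], obs_weights)
                     for n in range(fluxes.shape[1])])
print(combined[combined != 0])  # drop the zero-padded points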
def stat(x, key):
    return wq.median(x["time"], x[key])
def wmedian2(df, column_name, weights_name='wt0'):
    df = df.dropna(subset=[column_name, weights_name])
    return wq.median(df[column_name], df[weights_name])
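# A usage sketch for wmedian2 on a fabricated DataFrame with a value column
# and a weight column named 'wt0', matching the default above; assumes
# `import wquantiles as wq` in the module where wmedian2 lives.
import pandas as pd

df = pd.DataFrame({"price": [10.0, 20.0, None, 40.0],
                   "wt0": [1.0, 2.0, 1.0, 1.0]})
print(wmedian2(df, "price"))  # the NaN row is dropped first; roughly 20.0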
def game_predictions(df, home_team, away_team, last_n_games='all',
                     outer_opp_win_pct=True, central_tendency='mean',
                     distribution='poisson', inner_opp_win_pct=True,
                     weight_home=1, weight_away=1, n_simulations=1000):
    # suppress the SettingWithCopyWarning
    pd.options.mode.chained_assignment = None
    # drop unplayed games
    df = df.dropna(subset=['home_score'])
    # get win pct for each team for weighting later
    # get all teams
    list_all_teams = list(df['home_team']) + list(df['away_team'])
    # get the unique ones
    list_teams_unique = list(dict.fromkeys(list_all_teams))
    # get win pct for each team
    list_win_pct = []
    for team in list_teams_unique:
        # subset to where home team or away team == team
        df_subset = df[(df['home_team'] == team) | (df['away_team'] == team)]
        # see how many times team is in winning_team
        n_wins = list(df_subset['winning_team']).count(team)
        # get number of games
        n_games = df_subset.shape[0]
        # get win pct
        win_pct = n_wins / n_games
        # if we have zero win pct make it .01
        if win_pct == 0:
            win_pct = 0.01
        # append to list
        list_win_pct.append(win_pct)
    # match win_pct with team
    df_win_pct = pd.DataFrame({
        'team': list_teams_unique,
        'win_pct': list_win_pct
    })

    # -------------------------------------------------------------------------
    # 1. get all the games where the home_team was playing
    df_home = df[(df['home_team'] == home_team) | (df['away_team'] == home_team)]
    # to prevent errors, cap last_n_games at the number of rows
    n_rows = df_home.shape[0]
    if (last_n_games != 'all') and (last_n_games > n_rows):
        last_n_games = n_rows
    # use last n_games
    if last_n_games != 'all':
        df_home = df_home.iloc[-last_n_games:]
    # rename the columns because it helps some of the logic later
    df_home.rename(columns={
        'home_score': 'home_pts',
        'away_score': 'away_pts'
    }, inplace=True)
    # get points scored by home team (name the col home_score so it will match with the other logic we have)
    df_home['home_score'] = df_home.apply(
        lambda x: x['home_pts'] if x['home_team'] == home_team else x['away_pts'],
        axis=1)
    # get the points allowed by the home team
    df_home['away_score'] = df_home.apply(
        lambda x: x['home_pts'] if x['home_team'] != home_team else x['away_pts'],
        axis=1)
    # mark games where the home_team is home with a number (i.e., weight_home)
    df_home['weights'] = df_home.apply(
        lambda x: weight_home if x['home_team'] == home_team else 1, axis=1)
    # if we choose to weight each game by the opponent's win %
    if outer_opp_win_pct == True:
        # get the name of the opponent
        df_home['opponent_name'] = df_home.apply(
            lambda x: x['home_team'] if x['away_team'] == home_team else x['away_team'],
            axis=1)
        # merge with df_win_pct to get opponent win %
        df_home = pd.merge(left=df_home, right=df_win_pct,
                           left_on='opponent_name', right_on='team', how='left')
        # multiply 'weights' by 'win_pct'
        df_home['weights'] = df_home['weights'] * df_home['win_pct']
    # save weights
    list_weights = list(df_home['weights'])
    # some logic to catch errors
    if np.sum(list_weights) == 0:
        list_weights = [1 for x in list_weights]
    # get the central tendency number
    if central_tendency == 'mean':
        # calculate weighted mean of home_score
        home_home_score_avg = np.average(df_home['home_score'], weights=list_weights)
        # get weighted mean of away_score
        home_opponent_score_avg = np.average(df_home['away_score'], weights=list_weights)
    else:
        # calculate weighted median of home_score
        home_home_score_avg = weighted.median(df_home['home_score'], weights=list_weights)
        # get weighted median of away_score
        home_opponent_score_avg = weighted.median(df_home['away_score'], weights=list_weights)
    # if distribution == 'poisson'
    if distribution == 'poisson':
        # draw random numbers from a poisson distribution for predicted home score
        list_pred_home_home_score = list(
            np.random.poisson(home_home_score_avg, n_simulations))
        # draw random numbers from a poisson distribution for predicted away score
        list_pred_home_opponent_score = list(
            np.random.poisson(home_opponent_score_avg, n_simulations))
    # if distribution == 'normal'
    else:
        # calculate weighted sd of home_score (for the normal distribution)
        home_home_score_sd = np.sqrt(
            np.average((df_home['home_score'] - home_home_score_avg)**2,
                       weights=list_weights))
        # get weighted sd of away_score
        home_opponent_score_sd = np.sqrt(
            np.average((df_home['away_score'] - home_opponent_score_avg)**2,
                       weights=list_weights))
        # draw random numbers from a normal distribution
        list_pred_home_home_score = list(
            np.random.normal(loc=home_home_score_avg,
                             scale=home_home_score_sd,
                             size=n_simulations))
        list_pred_home_opponent_score = list(
            np.random.normal(loc=home_opponent_score_avg,
                             scale=home_opponent_score_sd,
                             size=n_simulations))

    # -------------------------------------------------------------------------
    # 2. repeat the same steps but using the away team
    # get all the games where the away_team was playing
    df_away = df[(df['home_team'] == away_team) | (df['away_team'] == away_team)]
    # to prevent errors, cap last_n_games at the number of rows
    n_rows = df_away.shape[0]
    if (last_n_games != 'all') and (last_n_games > n_rows):
        last_n_games = n_rows
    # use last n_games
    if last_n_games != 'all':
        df_away = df_away.iloc[-last_n_games:]
    # rename the columns because it helps some of the logic later
    df_away.rename(columns={
        'home_score': 'home_pts',
        'away_score': 'away_pts'
    }, inplace=True)
    # get points scored by away team (name the col away_score so it will match with the other logic we have)
    df_away['away_score'] = df_away.apply(
        lambda x: x['away_pts'] if x['away_team'] == away_team else x['home_pts'],
        axis=1)
    # get the points allowed by the away team
    df_away['home_score'] = df_away.apply(
        lambda x: x['away_pts'] if x['away_team'] != away_team else x['home_pts'],
        axis=1)
    # mark games where the away_team is away with a number (i.e., weight_away)
    df_away['weights'] = df_away.apply(
        lambda x: weight_away if x['away_team'] == away_team else 1, axis=1)
    # if we choose to weight each game by the opponent's win %
    if outer_opp_win_pct == True:
        # get the name of the opponent
        df_away['opponent_name'] = df_away.apply(
            lambda x: x['away_team'] if x['home_team'] == away_team else x['home_team'],
            axis=1)
        # merge with df_win_pct to get opponent win %
        df_away = pd.merge(left=df_away, right=df_win_pct,
                           left_on='opponent_name', right_on='team', how='left')
        # multiply 'weights' by 'win_pct'
        df_away['weights'] = df_away['weights'] * df_away['win_pct']
    # save weights
    list_weights = list(df_away['weights'])
    # some logic to catch errors
    if np.sum(list_weights) == 0:
        list_weights = [1 for x in list_weights]
    # get the central tendency number
    if central_tendency == 'mean':
        # calculate weighted mean of away_score
        away_away_score_avg = np.average(df_away['away_score'], weights=list_weights)
        # get weighted mean of points allowed
        away_opponent_score_avg = np.average(df_away['home_score'], weights=list_weights)
    else:  # i.e., median
        # calculate weighted median of away_score
        away_away_score_avg = weighted.median(df_away['away_score'], weights=list_weights)
        # get weighted median of points allowed
        away_opponent_score_avg = weighted.median(df_away['home_score'], weights=list_weights)
    # if distribution == 'poisson'
    if distribution == 'poisson':
        # draw random numbers from a poisson distribution for predicted away score
        list_pred_away_away_score = list(
            np.random.poisson(away_away_score_avg, n_simulations))
        # draw random numbers from a poisson distribution for predicted points allowed
        list_pred_away_opponent_score = list(
            np.random.poisson(away_opponent_score_avg, n_simulations))
    # if distribution == 'normal'
    else:
        # calculate weighted sd of away_score (for the normal distribution)
        away_away_score_sd = np.sqrt(
            np.average((df_away['away_score'] - away_away_score_avg)**2,
                       weights=list_weights))
        # get weighted sd of points allowed
        away_opponent_score_sd = np.sqrt(
            np.average((df_away['home_score'] - away_opponent_score_avg)**2,
                       weights=list_weights))
        # draw random numbers from a normal distribution
        list_pred_away_away_score = list(
            np.random.normal(loc=away_away_score_avg,
                             scale=away_away_score_sd,
                             size=n_simulations))
        list_pred_away_opponent_score = list(
            np.random.normal(loc=away_opponent_score_avg,
                             scale=away_opponent_score_sd,
                             size=n_simulations))

    # -------------------------------------------------------------------------
    # put into a df
    df_predictions = pd.DataFrame({
        'pred_home_home_score': list_pred_home_home_score,
        'pred_home_opponent_score': list_pred_home_opponent_score,
        'pred_away_away_score': list_pred_away_away_score,
        'pred_away_opponent_score': list_pred_away_opponent_score
    })

    # -------------------------------------------------------------------------
    # 3. now let's have the scores meet in the middle
    # if we want a straight avg
    if inner_opp_win_pct == False:
        list_weights = [1, 1]
    # if we want to weight in terms of win pct
    else:
        # get list of teams
        list_matchup_teams = [home_team, away_team]
        # get win pct for each team in the matchup so we can use them as weights
        list_weights = []
        for opp in list_matchup_teams:
            # find index of opp in df_win_pct
            index_opp = list(df_win_pct['team']).index(opp)
            # get win pct
            win_pct = df_win_pct['win_pct'][index_opp]
            # append to list
            list_weights.append(win_pct)
        # logic to avoid errors
        if np.sum(list_weights) == 0:
            list_weights = [1, 1]
    # home score prediction
    df_predictions['pred_home_score'] = df_predictions.apply(
        lambda x: np.average(
            [x['pred_home_home_score'], x['pred_away_opponent_score']],
            weights=list_weights),
        axis=1)
    # away score prediction
    df_predictions['pred_away_score'] = df_predictions.apply(
        lambda x: np.average(
            [x['pred_home_opponent_score'], x['pred_away_away_score']],
            weights=list_weights),
        axis=1)
    # create a col of 1/0 to deal with ties
    df_predictions['rand_binomial'] = np.random.binomial(1, 0.5, n_simulations)

    # create col == 1 if home team wins
    # define function
    def did_home_win(pred_home_score, pred_away_score, rand_binomial):
        if pred_home_score > pred_away_score:
            return 1
        elif pred_home_score < pred_away_score:
            return 0
        elif pred_home_score == pred_away_score and rand_binomial == 1:
            return 1
        else:
            return 0

    # get sum of games where home team won
    sum_home_wins = np.sum(
        df_predictions.apply(
            lambda x: did_home_win(pred_home_score=x['pred_home_score'],
                                   pred_away_score=x['pred_away_score'],
                                   rand_binomial=x['rand_binomial']),
            axis=1))
    # get the proportion of games where the home team is > away team
    prop_home_win = sum_home_wins / n_simulations
    # get mean home score
    mean_home_score = round(np.mean(df_predictions['pred_home_score']), 3)
    # get mean away score
    mean_away_score = round(np.mean(df_predictions['pred_away_score']), 3)
    # get winning team
    if prop_home_win >= .5:
        winning_team = home_team
    else:
        winning_team = away_team
    # create a dictionary to return objects
    dict_results = {
        'mean_home_pts': mean_home_score,
        'mean_away_pts': mean_away_score,
        'prob_home_win': prop_home_win,
        'winning_team': winning_team
    }
    # return dict_results
    return dict_results
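# A hedged usage sketch with a tiny fabricated schedule; the column names
# (home_team, away_team, home_score, away_score, winning_team) follow what
# the function reads above, and None marks a drawn game in this toy data.
# Assumes pd/np are imported in the function's module.
import numpy as np
import pandas as pd

df_games = pd.DataFrame({
    'home_team': ['A', 'B', 'A', 'C'],
    'away_team': ['B', 'C', 'C', 'A'],
    'home_score': [3, 1, 2, 0],
    'away_score': [1, 1, 2, 2],
    'winning_team': ['A', None, None, 'A'],
})
print(game_predictions(df_games, home_team='A', away_team='B',
                       n_simulations=500))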
def wmedian3(df, column_name, weights_name='wt0'):
    import wquantiles as wq
    df = df.dropna(subset=[column_name, weights_name])
    return wq.median(df[column_name], df[weights_name])
for y in range(nTopos):
    distMat = make2DarrayFrom1DupperTriangle(dists[x, y, :], nTaxa)
    for z in range(len(nodeNames[y])):
        depths[x, y, z] = distMat[nodeLeafPairs[y][z]].mean() if not nodes_all[y][z].is_leaf() else 0.0

# scale depths by dividing by the depth of the root
# the first node in each topo is the root, as the traversal goes to the root first
depths = depths / np.repeat(depths[:, :, 0, np.newaxis], depths.shape[2], axis=2)

# anywhere we have nan is where the root depth was zero. This happens where we had
# missing data. So we can set all these tree depths to zero.
depths = np.nan_to_num(depths)

depths_average = np.average(depths, axis=0,
                            weights=np.repeat(weights[:, :, np.newaxis], depths.shape[2], axis=2))

depths_median = [[wquantiles.median(depths[:, j, k], weights=weights[:, j])
                  for k in range(depths.shape[2])]
                 for j in range(depths.shape[1])]

if args.quantiles:
    depths_qL = [[wquantiles.quantile(depths[:, j, k], weights[:, j], args.quantiles[0])
                  for k in range(depths.shape[2])]
                 for j in range(depths.shape[1])]
    depths_qU = [[wquantiles.quantile(depths[:, j, k], weights[:, j], args.quantiles[1])
                  for k in range(depths.shape[2])]
                 for j in range(depths.shape[1])]

#cols = np.array([
#"#2BCE48", #Green
#"#005C31", #Forest
#"#94FFB5", #Jade
#"#9DCC00", #Lime
#"#426600", #Quagmire
#"#00998F", #Turquoise
#"#5EF1F2", #Sky
#"#0075DC", #Blue
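# A small standalone sketch of the pattern above, on fabricated shapes: take
# a weighted median over the replicate axis (axis 0) for every
# (topology, node) cell, with one weight per replicate and topology.
import numpy as np
import wquantiles

reps, topos, nodes = 50, 2, 3
vals = np.random.rand(reps, topos, nodes)
w = np.random.rand(reps, topos)
med = [[wquantiles.median(vals[:, j, k], weights=w[:, j])
        for k in range(nodes)]
       for j in range(topos)]
print(np.array(med).shape)  # (2, 3)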
def plot_1d_binned_slices(truth, reco1, reco2=None,
                          xarray1=None, xarray2=None, truth2=None,
                          plot_resolution=False, use_fraction=False,
                          bins=10, xmin=-1., xmax=1., style="contours",
                          x_name="Zenith", x_units="",
                          y_units=None, reco1_name="Reco 1", reco2_name="Reco 2",
                          reco1_weight=None, reco2_weight=None,
                          save=True, savefolder='.'):
    """Plots different energy slices vs each other (systematic set arrays)
    Receives:
        truth = 1D array with truth values
        reco1 = 1D array that has reconstructed results
        reco2 = optional, 1D array that has an alternate reconstructed results
        xarray1 = optional, 1D array that the reco1 variable (or resolution) will be plotted against, if none is given, will automatically use truth
        xarray2 = optional, 1D array that the reco2 variable (or resolution2) will be plotted against, if none is given, will automatically use xarray1
        truth2 = 1D array with truth values used to calculate resolution2
        plot_resolution = use resolution (reco - truth) instead of just reconstructed values
        use_fraction = bool, use fractional resolution instead of absolute, where (reco - truth)/truth
        style = "errorbars" is the only string that triggers the errorbar version, default is the contour plot version
        bins = integer number of data points you want (range/bins = width)
        xmin = minimum truth value to start cut at (default = -1.)
        xmax = maximum truth value to end cut at (default = 1.)
        x_name = variable for x axis (what is the truth)
        x_units = units for truth/x-axis variable
        reco1_name = name for reconstruction 1
        reco2_name = name for reconstruction 2
        reco1_weight = 1D array for reco1 weights, if left None, will not use
        reco2_weight = 1D array for reco2 weights, if left None, will not use
    Returns:
        Scatter plot with truth bins on x axis (median of bin width)
        y axis has median of resolution or absolute reconstructed value
        with error bars containing given percentile
    """
    percentile_in_peak = 68.27  # CAN CHANGE
    left_tail_percentile = (100. - percentile_in_peak) / 2
    right_tail_percentile = 100. - left_tail_percentile
    ranges = numpy.linspace(xmin, xmax, num=bins)
    centers = (ranges[1:] + ranges[:-1]) / 2.

    # if no xarray given, automatically use truth
    if xarray1 is None:
        xarray1 = truth
    # Calculate resolution if plot_resolution flag == True
    if plot_resolution:
        if use_fraction:
            yvariable = ((reco1 - truth) / truth)  # in fraction
        else:
            yvariable = (reco1 - truth)
    else:  # use reco directly, not resolution
        yvariable = reco1
        assert use_fraction == False, "Flag for fractional resolution only, not doing resolution here"
    medians = numpy.zeros(len(centers))
    err_from = numpy.zeros(len(centers))
    err_to = numpy.zeros(len(centers))

    # Compare to second reconstruction if given
    if reco2 is not None:
        # check if some variables exist, if not, set to match reco1's
        if truth2 is None:
            truth2 = truth
        if xarray2 is None:
            xarray2 = xarray1
        if plot_resolution:
            if use_fraction:
                yvariable2 = ((reco2 - truth2) / truth2)
            else:
                yvariable2 = (reco2 - truth2)
        else:
            yvariable2 = reco2
        medians2 = numpy.zeros(len(centers))
        err_from2 = numpy.zeros(len(centers))
        err_to2 = numpy.zeros(len(centers))

    # Find median and percentile bounds for data
    for i in range(len(ranges) - 1):
        # Make a cut based on the truth (binned on truth)
        var_to = ranges[i + 1]
        var_from = ranges[i]
        cut = (xarray1 >= var_from) & (xarray1 < var_to)
        assert sum(cut) > 0, "No events in xbin from %s to %s for reco1, may need to change xmin, xmax, or number of bins or check truth/xarray inputs" % (var_from, var_to)
        if reco2 is not None:
            cut2 = (xarray2 >= var_from) & (xarray2 < var_to)
            assert sum(cut2) > 0, "No events in xbin from %s to %s for reco2, may need to change xmin, xmax, or number of bins or check truth2/xarray2 inputs" % (var_from, var_to)
        # find the median and percentile bounds of reco1 (or resolution) in this bin
        if reco1_weight is None:
            lower_lim = numpy.percentile(yvariable[cut], left_tail_percentile)
            upper_lim = numpy.percentile(yvariable[cut], right_tail_percentile)
            median = numpy.percentile(yvariable[cut], 50.)
        else:
            import wquantiles as wq
            # wq.quantile expects a fraction in [0, 1], so convert from percent
            lower_lim = wq.quantile(yvariable[cut], reco1_weight[cut], left_tail_percentile / 100.)
            upper_lim = wq.quantile(yvariable[cut], reco1_weight[cut], right_tail_percentile / 100.)
            median = wq.median(yvariable[cut], reco1_weight[cut])
        medians[i] = median
        err_from[i] = lower_lim
        err_to[i] = upper_lim
        # find the median and percentile bounds of reco2 (or resolution2) in this bin
        if reco2 is not None:
            if reco2_weight is None:
                lower_lim2 = numpy.percentile(yvariable2[cut2], left_tail_percentile)
                upper_lim2 = numpy.percentile(yvariable2[cut2], right_tail_percentile)
                median2 = numpy.percentile(yvariable2[cut2], 50.)
            else:
                import wquantiles as wq
                # wq.quantile expects a fraction in [0, 1], so convert from percent
                lower_lim2 = wq.quantile(yvariable2[cut2], reco2_weight[cut2], left_tail_percentile / 100.)
                upper_lim2 = wq.quantile(yvariable2[cut2], reco2_weight[cut2], right_tail_percentile / 100.)
                median2 = wq.median(yvariable2[cut2], reco2_weight[cut2])
            medians2[i] = median2
            err_from2[i] = lower_lim2
            err_to2[i] = upper_lim2

    # Make plot
    plt.figure(figsize=(10, 7))
    # Median as datapoint
    # Percentile as y error bars
    # Bin size as x error bars
    if style == "errorbars":
        plt.errorbar(centers, medians,
                     yerr=[medians - err_from, err_to - medians],
                     xerr=[centers - ranges[:-1], ranges[1:] - centers],
                     capsize=5.0, fmt='o', label="%s" % reco1_name)
        # Compare to second reconstruction, if given
        if reco2 is not None:
            plt.errorbar(centers, medians2,
                         yerr=[medians2 - err_from2, err_to2 - medians2],
                         xerr=[centers - ranges[:-1], ranges[1:] - centers],
                         capsize=5.0, fmt='o', label="%s" % reco2_name)
        plt.legend(loc="upper center")
    # Make contour plot
    # Center solid line is median
    # Shaded region is percentile
    # NOTE: plotted using centers, so 0th and last bins look like they stop short (by 1/2*bin_size)
    else:
        alpha = 0.5
        lwid = 3
        cmap = plt.get_cmap('Blues')
        colors = cmap(numpy.linspace(0, 1, 2 + 2))[2:]
        color = colors[0]
        cmap = plt.get_cmap('Oranges')
        rcolors = cmap(numpy.linspace(0, 1, 2 + 2))[2:]
        rcolor = rcolors[0]
        ax = plt.gca()
        ax.plot(centers, medians, linestyle='-',
                label="%s median" % (reco1_name), color=color, linewidth=lwid)
        ax.fill_between(centers, medians, err_from, color=color, alpha=alpha)
        ax.fill_between(centers, medians, err_to, color=color, alpha=alpha,
                        label=reco1_name + " %i" % percentile_in_peak + '%')
        if reco2 is not None:
            ax.plot(centers, medians2, color=rcolor, linestyle='-',
                    label="%s median" % reco2_name, linewidth=lwid)
            ax.fill_between(centers, medians2, err_from2, color=rcolor, alpha=alpha)
            ax.fill_between(centers, medians2, err_to2, color=rcolor, alpha=alpha,
                            label=reco2_name + " %i" % percentile_in_peak + '%')

    # Extra features to have a horizontal 0 line and trim the x axis
    plt.plot([xmin, xmax], [0, 0], color='k')
    plt.xlim(xmin, xmax)

    # Make pretty labels
    plt.xlabel("%s %s" % (x_name, x_units))
    if plot_resolution:
        if use_fraction:
            plt.ylabel("Fractional Resolution: \n (reconstruction - truth)/truth")
        else:
            plt.ylabel("Resolution: \n reconstruction - truth %s" % x_units)
            if y_units is not None:
                plt.ylabel("Resolution: \n reconstruction - truth %s" % y_units)
    else:
        plt.ylabel("Reconstructed %s %s" % (x_name, x_units))

    # Make a pretty title
    title = "%s Dependence for %s" % (x_name, reco1_name)
    if reco2 is not None:
        title += " and %s" % reco2_name
    if plot_resolution:
        title += " Resolution"
    plt.title("%s" % (title))

    # Make a pretty filename
    savename = "%s" % (x_name.replace(" ", ""))
    if use_fraction:
        savename += "Frac"
    if plot_resolution:
        savename += "Resolution"
    if reco2 is not None:
        savename += "_Compare%s" % (reco2_name.replace(" ", ""))
    if save == True:
        plt.savefig("%s/%s.png" % (savefolder, savename))
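# A hedged usage sketch with fabricated toy arrays: compare the resolution of
# two reconstructions binned in the truth variable. Assumes the function's
# module has `import numpy` and `import matplotlib.pyplot as plt` at the top,
# as the body above expects.
import numpy
import matplotlib.pyplot as plt

rng = numpy.random.default_rng(1)
true_zenith = rng.uniform(-1, 1, 2000)
reco_a = true_zenith + rng.normal(0, 0.1, 2000)
reco_b = true_zenith + rng.normal(0, 0.2, 2000)
plot_1d_binned_slices(true_zenith, reco_a, reco2=reco_b,
                      plot_resolution=True, bins=10,
                      xmin=-1., xmax=1., x_name="Cosine Zenith",
                      save=False)
plt.show()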
state = pd.read_csv(STATE_CSV)
print(state["Population"].mean())
print(trim_mean(state["Population"], 0.1))
print(state["Population"].median())

# Weighted mean is available with numpy. For weighted median, we can use the
# specialised package `wquantiles` (https://pypi.org/project/wquantiles/).
print(state["Murder.Rate"].mean())
print(np.average(state["Murder.Rate"], weights=state["Population"]))
print(wquantiles.median(state["Murder.Rate"], weights=state["Population"]))

# Estimates of Variability
# Table 1-2
print(state.head(8))

# Standard deviation
print(state["Population"].std())

# Interquartile range is calculated as the difference of the 75% and 25% quantile.
print(state["Population"].quantile(0.75) - state["Population"].quantile(0.25))

# Median absolute deviation from the median can be calculated with a method in _statsmodels_
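# A sketch of that MAD calculation, reusing the `state` DataFrame from above
# and assuming statsmodels is installed; note that statsmodels' mad() rescales
# by ~1/0.6745 by default, so the result is comparable to the standard
# deviation for normally distributed data.
from statsmodels import robust
print(robust.scale.mad(state["Population"]))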
###
### Practical Statistics for Data Scientists
###
import pandas as pd
from scipy import stats
import numpy as np
import wquantiles

state = pd.read_csv(
    'D:/my-coding/practical-statistics-4-ds-book/data/state.csv')

state_mean = state['Population'].mean()
print(state_mean)

state_mean_trimmed_01 = stats.trim_mean(state['Population'], 0.1)
print(state_mean_trimmed_01)

state_median = state['Population'].median()
print(state_median)

state_weighted_mean = np.average(state['Murder.Rate'], weights=state['Population'])
print(state_weighted_mean)

state_weighted_median = wquantiles.median(state['Murder.Rate'], weights=state['Population'])
print(state_weighted_median)

## key ideas
## the basic metric for location is the mean, but it can be sensitive to extreme values (outliers)
## other metrics such as the median and trimmed mean are less sensitive to outliers and unusual
## distributions and hence are more robust