def _process(self, element, key=None):
    # Pull the x/y values and their Dimension objects from the HoloViews element.
    x, y = (element.dimension_values(i) for i in range(2))
    x_dim, y_dim = (element.dimensions()[i] for i in range(2))
    bins = np.asarray(self.p.bins)
    # Bin centers: midpoints between adjacent bin edges.
    x_avg = bins[:-1] + np.diff(bins) / 2
    y_avg, y16, y84 = (np.full(len(x_avg), np.nan) for _ in range(3))
    for k, ll, ul in zip(range(len(x_avg)), bins[:-1], bins[1:]):
        y_sel = y[(ll < x) & (x <= ul)]
        y_avg[k] = self.p.avg_fun(y_sel)
        # The 16th/84th percentiles bracket roughly +/-1 sigma for Gaussian data.
        y16[k] = np.nanquantile(y_sel, q=0.16)
        y84[k] = np.nanquantile(y_sel, q=0.84)
    errors = {
        x_dim.name: x_avg,
        y_dim.name: y_avg,
        'y16': y_avg - y16,
        'y84': y84 - y_avg,
    }
    return hv.ErrorBars(errors, kdims=[x_dim], vdims=[y_dim, 'y16', 'y84'])
def get_gene_stats(xvals, col_idxs, tissues):
    """
    Compute summary stats across all samples for a given gene & tissue
    """
    xmin, xq1, xmed, xmean, xq3, xmax, xsd, xmad = ([] for _ in range(8))
    for tissue in tissues:
        # Map this tissue's sample IDs to column indices, skipping unknown samples.
        tidx = [col_idxs[s] for s in tissues[tissue] if s in col_idxs]
        if len(tidx) > 0:
            tvals = xvals[tidx]
            xmin.append(np.nanmin(tvals))
            xq1.append(np.nanquantile(tvals, q=0.25))
            xmed.append(np.nanmedian(tvals))
            xmean.append(np.nanmean(tvals))
            xq3.append(np.nanquantile(tvals, q=0.75))
            xmax.append(np.nanmax(tvals))
            xsd.append(np.nanstd(tvals))
            xmad.append(mad(tvals))
        else:
            # No samples found for this tissue: record NaN for every statistic.
            for stat in (xmin, xq1, xmed, xmean, xq3, xmax, xsd, xmad):
                stat.append(np.nan)
    return xmin, xq1, xmed, xmean, xq3, xmax, xsd, xmad
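# A hedged usage sketch for get_gene_stats: the data below are invented, and
# `mad` is stood in by a simple median-absolute-deviation lambda, since the
# helper the function references is not shown here.
import numpy as np

mad = lambda v: np.nanmedian(np.abs(v - np.nanmedian(v)))  # stand-in helper

xvals = np.array([1.0, 2.0, 3.0, 4.0])  # expression values, one per sample
col_idxs = {'s1': 0, 's2': 1, 's3': 2, 's4': 3}
tissues = {'liver': ['s1', 's2'], 'brain': ['s3', 's4'], 'skin': ['sX']}

xmin, xq1, xmed, xmean, xq3, xmax, xsd, xmad = get_gene_stats(xvals, col_idxs, tissues)
print(xmed)  # [1.5, 3.5, nan] -- 'skin' has no matching samples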
def plot_prop_pdf_stack(source): pmh = ParmapHandler(source) props = ['tkin', 'texc', 'ncol', 'sigm', 'vcen'] all_bins = [ np.linspace(lo, hi, 100) for lo, hi in [ (7.0, 25.0), # tkin, K (2.7, 15.0), # texc, K (12.0, 17.0), # ncol, log(cm^-2) (0.0, 2.0), # sigm, km/s (-3.0, 3.0), # vcen, km/s (relative) ] ] fig, axes = plt.subplots(ncols=1, nrows=len(props), figsize=(4, 6)) for prop, bins, ax in zip(props, all_bins, axes): data = pmh.get_hdu(prop, rel_velo=True).data vals = data.flatten() med = np.nanmedian(vals) qlo = np.nanquantile(vals, 0.165) qhi = np.nanquantile(vals, 0.835) hist, _, _ = ax.hist(vals, bins=bins, density=True, color='0.3') ax.vlines( [qlo, med, qhi], 0, hist.max(), linestyles=['dotted', 'dashed', 'dotted'], colors='red', ) ax.set_xlim(bins.min(), bins.max()) ax.set_xlabel(pmh.get_label(prop)) ax.set_ylabel('PDF') plt.tight_layout(h_pad=0.5) save_figure(f'{source}_prop_pdf_stack', do_eps=False)
def replace_outliers_iqr(pivot_outliers, k):
    '''Replace outliers using an IQR (Tukey fence) deviation method'''
    pivot_no_outliers = pd.DataFrame(columns=pivot_outliers.columns,
                                     index=pivot_outliers.index)
    pivot_no_outliers.rename(columns={'with_outliers': 'without_outliers'},
                             level=0, inplace=True)
    for x in pivot_outliers.index:
        values = pivot_outliers.loc[x, :].values
        # Skip constant or all-NaN rows, where the fences are meaningless.
        if np.nanstd(values) != 0 and np.isnan(values).sum() != len(values):
            Q1 = np.nanquantile(values, 0.25)
            Q3 = np.nanquantile(values, 0.75)
            IQR = Q3 - Q1
            LB = Q1 - k * IQR
            UB = Q3 + k * IQR
            # Replace anything outside [LB, UB] with the row median.
            new_values = np.where((values < LB) | (values > UB),
                                  np.nanmedian(values), values)
        else:
            new_values = values
        pivot_no_outliers.iloc[
            pivot_outliers.index.get_loc(x), :] = new_values.astype('float')
    return pivot_no_outliers
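# A minimal, illustrative call to replace_outliers_iqr. The pivot layout is
# assumed: a column MultiIndex whose level 0 is named 'with_outliers', as the
# level-0 rename inside the function implies; all values are made up.
import numpy as np
import pandas as pd

cols = pd.MultiIndex.from_product([['with_outliers'], ['a', 'b', 'c', 'd']])
pivot = pd.DataFrame([[1.0, 1.2, 0.9, 50.0],   # 50.0 is an obvious outlier
                      [2.0, 2.1, 1.9, 2.2]],
                     index=['row1', 'row2'], columns=cols)

cleaned = replace_outliers_iqr(pivot, k=1.5)  # k=1.5 is the usual Tukey fence
print(cleaned)  # the 50.0 becomes the row median (1.1)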
def _dfinfo(dataframe, asstring=True):  # note: `asstring` is currently unused
    '''Returns a dataframe with info about the given `dataframe` '''
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    sum_df = odict()
    for col in floatingcols(dataframe):
        q01 = np.nanquantile(dataframe[col], 0.01)
        q99 = np.nanquantile(dataframe[col], 0.99)
        # Rows below the 1st / above the 99th percentile of this column:
        df1, df99 = dataframe[(dataframe[col] < q01)], dataframe[(dataframe[col] > q99)]
        sum_df[col] = {
            infocols[0]: np.nanmin(dataframe[col]),
            infocols[1]: np.nanquantile(dataframe[col], 0.5),
            infocols[2]: np.nanmax(dataframe[col]),
            infocols[3]: (~np.isfinite(dataframe[col])).sum(),
            infocols[4]: len(df1),
            infocols[5]: len(df99)
        }
    return pd.DataFrame(data=list(sum_df.values()),
                        columns=infocols,
                        index=list(sum_df.keys()))
def search_assemble_mld_radavg(wod_dbase, lon_arr, lat_arr, kmrad=1e2,
                               crit=0.0528e0):
    avg_mlds = []  # actually the median MLD per search circle
    std_mlds = []
    min_mlds = []  # 5th-percentile MLD (robust "min")
    max_mlds = []  # 95th-percentile MLD (robust "max")
    for n, (lonc, latc) in enumerate(zip(lon_arr, lat_arr)):
        print(n, lonc, latc)
        locs = lonlat_inside_km_radius(wod_dbase['lon'], wod_dbase['lat'],
                                       (lonc, latc), kmrad)
        wod_loc_subset = wod_dbase[locs]
        wod_loc_subset = quik_quality_control(wod_loc_subset)
        print("found %s good profiles in area" % len(wod_loc_subset))
        # Note: circles with no good profiles are skipped, so the returned
        # arrays can be shorter than lon_arr/lat_arr.
        if len(wod_loc_subset) > 0:
            vars_arr = derive_variables(wod_loc_subset, which_ones='all')
            mlds = [
                calc_mld(SA, CT, P, crit=crit) for SA, CT, P in zip(
                    vars_arr[0], vars_arr[1], wod_loc_subset['pres'])
            ]
            avg_mlds.append(np.nanmedian(mlds))
            std_mlds.append(np.nanstd(mlds))
            min_mlds.append(np.nanquantile(mlds, .05))
            max_mlds.append(np.nanquantile(mlds, .95))
    return np.asarray(avg_mlds), np.asarray(std_mlds), np.asarray(
        min_mlds), np.asarray(max_mlds)
def stat_summarizer(figure):
    avg_perf = np.nanmean(figure)
    min_perf = np.nanmin(figure)
    q1_perf = np.nanquantile(figure, 0.25)
    med_perf = np.nanmedian(figure)
    q3_perf = np.nanquantile(figure, 0.75)
    max_perf = np.nanmax(figure)
    stdev = np.nanstd(figure)
    # scipy.stats.median_absolute_deviation was removed in SciPy 1.9;
    # median_abs_deviation with scale='normal' is behavior-preserving.
    medianabdev = stats.median_abs_deviation(figure, nan_policy='omit',
                                             scale='normal')
    sharpe = avg_perf / stdev
    # MAD-based analogue of the Sharpe ratio.
    sharpemad = med_perf / medianabdev
    finaldict = {
        'avg_perf': avg_perf,
        'med_perf': med_perf,
        'stdev': stdev,
        'medianabdev': medianabdev,
        'min_perf': min_perf,
        'max_perf': max_perf,
        'q1_perf': q1_perf,
        'q3_perf': q3_perf,
        'sharpe': sharpe,
        'sharpemad': sharpemad
    }
    return finaldict
def _make_area_buffer(d_x, d_y, q=1):
    n_rows, n_cols = d_y.shape
    with np.errstate(divide='ignore'):
        # Lower/upper envelope of the data at quantile q (per column).
        y_min = np.nanquantile(d_y, 1 - q, 0)
        y_max = np.nanquantile(d_y, q, 0)
    # Split the envelopes into contiguous non-NaN segments.
    masks = using_clump(y_min)
    mesh_vertice = []
    mesh_face = []
    last_index = 0
    for m in masks:
        # Upper edge left-to-right, lower edge right-to-left, so the two
        # polylines close into a polygon.
        _max = np.vstack((d_x[m], y_max[m])).T
        _min = np.vstack((d_x[m][::-1], y_min[m][::-1])).T
        mv, mf = polygon2mesh(_max, _min)
        if len(mf) == 0:
            continue
        mf += last_index
        mesh_vertice.append(mv)
        mesh_face.append(mf)
        last_index = mf[-1, -1] + 1
    if len(mesh_vertice) == 0:
        return None, None
    return np.concatenate(mesh_vertice), np.concatenate(mesh_face)
def bootstrap_ci(estimate, straps, alpha=0.05, method='pivot', axis=0,
                 stack=True):
    """
    Return bootstrap confidence intervals.

    With method='pivot', the interval is

        C_l = 2T - Q(1 - alpha/2),   C_u = 2T - Q(alpha/2)

    where T is the point estimate of the statistic, alpha is the
    significance level, and Q(x) is the empirical x-quantile of the
    statistic across the bootstrap replicates. With method='percentile',
    the raw bootstrap quantiles are returned instead.
    """
    qlower, qupper = (np.nanquantile(straps, alpha / 2, axis=axis),
                      np.nanquantile(straps, 1 - alpha / 2, axis=axis))
    if method == 'percentile':
        CIs = qlower, estimate, qupper
    elif method == 'pivot':
        CIs = 2 * estimate - qupper, estimate, 2 * estimate - qlower
    else:
        raise ValueError("method must be either 'pivot' or 'percentile'")
    if stack:
        return np.stack(CIs)
    return CIs
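# A quick sanity check of bootstrap_ci, bootstrapping the mean of a skewed
# sample; the data are synthetic.
import numpy as np

rng = np.random.default_rng(0)
sample = rng.exponential(scale=2.0, size=500)

estimate = sample.mean()
# One bootstrap replicate = resample with replacement, recompute the mean.
straps = np.array([rng.choice(sample, size=sample.size, replace=True).mean()
                   for _ in range(2000)])

lower, est, upper = bootstrap_ci(estimate, straps, alpha=0.05, stack=False)
print(f"mean = {est:.3f}, 95% pivot CI = ({lower:.3f}, {upper:.3f})")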
def stat_summarizer_old(figure):
    avg_perf = np.nanmean(figure)
    min_perf = np.nanmin(figure)
    q1_perf = np.nanquantile(figure, 0.25)
    med_perf = np.nanmedian(figure)
    q3_perf = np.nanquantile(figure, 0.75)
    max_perf = np.nanmax(figure)
    iqr_perf = q3_perf - q1_perf
    max_min = max_perf - min_perf
    maxq3 = max_perf - q3_perf
    q1min = q1_perf - min_perf
    stdev = np.nanstd(figure)
    # See stat_summarizer above: median_abs_deviation replaces the removed
    # scipy.stats.median_absolute_deviation.
    medianabdev = stats.median_abs_deviation(figure, nan_policy='omit',
                                             scale='normal')
    finaldict = {
        'avg_perf': avg_perf,
        'min_perf': min_perf,
        'q1_perf': q1_perf,
        'med_perf': med_perf,
        'q3_perf': q3_perf,
        'max_perf': max_perf,
        'iqr_perf': iqr_perf,
        'max_min': max_min,
        'maxq3': maxq3,
        'q1min': q1min,
        'stdev': stdev,
        'medianabdev': medianabdev
    }
    return finaldict
def _inlier_range(series):
    low = np.nanquantile(series, 0.01)
    high = np.nanquantile(series, 0.99)
    assert low <= high
    # Heuristic: widen the 1st-99th percentile band by half its span on
    # each side, to avoid clipping borderline inliers.
    inner_range = (high - low) / 2
    return low - inner_range, high + inner_range
def scatter_plot_by_wind(wind_low_threshold, wind_up_threshold, x, y,
                         raster_variable_name, station_name):
    sns.set(rc={'figure.figsize': (9, 5)})
    sns.set_theme(style="white")
    scatter_file = os.path.join(
        dir_comparison_plots, raster_variable_name +
        "_wind_%s_%s_station_%s_scatterplot.png" %
        (wind_low_threshold, wind_up_threshold, station_name))
    # scatterplot
    if len(x) == 0 or len(y) == 0:
        return
    try:
        m, b = np.polyfit(x, y, 1)
    except np.linalg.LinAlgError:  # only zeros (nans)
        return
    regress = linregress(x, y)
    plt.plot(x, m * x + b, color="#2b2b2b")
    # seaborn >= 0.12 requires keyword arguments here.
    sns.scatterplot(x=x, y=y, color="#c404ab")
    # Use the current axes; plt.axes() would create a new one.
    ax = plt.gca()
    ax.xaxis.set_tick_params(labelsize=8)
    ax.yaxis.set_tick_params(labelsize=8)
    plt.text(np.nanquantile(x, 0.025), np.nanquantile(y, 0.9),
             "Lin. regression\nr-value: %s\nslope: %s" %
             (np.round(regress.rvalue, 2), np.round(regress.slope, 2)),
             fontsize=8)
    plt.ylabel("S2 trucks")
    plt.xlabel(raster_variable_name)
    plt.title("UBA station %s | Wind direction %s-%s" %
              (station_name, wind_low_threshold, wind_up_threshold))
    plt.savefig(scatter_file, dpi=300)
    plt.close()
def thr_IQR(x, times=3, series=False, exclude_zero=True):
    """
    Mask values outside a Tukey-style fence of `times` * IQR as NaN.
    If series is True, the last axis should be series.
    Requires `import copy` and `import numpy as np` at module level.
    """
    if series is False:
        x = x[..., None]
    if exclude_zero is True:
        # Compute the quartiles over non-zero entries only.
        qu = np.asarray([
            np.nanquantile(x[..., i][x[..., i] != 0], 0.75)
            for i in range(x.shape[-1])
        ])
        ql = np.asarray([
            np.nanquantile(x[..., i][x[..., i] != 0], 0.25)
            for i in range(x.shape[-1])
        ])
    else:
        qu = np.asarray(
            [np.nanquantile(x[..., i], 0.75) for i in range(x.shape[-1])])
        ql = np.asarray(
            [np.nanquantile(x[..., i], 0.25) for i in range(x.shape[-1])])
    x_post = copy.deepcopy(x)
    # NaN out anything beyond `times` IQRs above Q3 or below Q1.
    x_post[x_post > (qu + times * (qu - ql))] = np.nan
    x_post[x_post < (ql - times * (qu - ql))] = np.nan
    if series is False:
        return x_post[..., 0]
    return x_post
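# Illustration of thr_IQR on a 1-D array (series=False): the spike at 100
# lies beyond the 3*IQR fence and is masked to NaN; the values are made up.
import numpy as np

data = np.array([1.0, 2.0, 3.0, 2.5, 1.5, 100.0, 2.2])
print(thr_IQR(data, times=3))  # [1.  2.  3.  2.5 1.5 nan 2.2]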
def _dfinfo(dataframe):
    '''Returns a dataframe with statistical info about the given `dataframe` '''
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    defaultcolvalues = [np.nan, np.nan, np.nan, 0, 0, 0]
    sum_df = odict()
    for col in floatingcols(dataframe):
        colvalues = defaultcolvalues
        if not dataframe.empty:
            q01 = np.nanquantile(dataframe[col], 0.01)
            q99 = np.nanquantile(dataframe[col], 0.99)
            df1, df99 = dataframe[(dataframe[col] < q01)], dataframe[(dataframe[col] > q99)]
            colvalues = [
                np.nanmin(dataframe[col]),  # Min
                np.nanquantile(dataframe[col], 0.5),  # Median
                np.nanmax(dataframe[col]),  # Max
                (~np.isfinite(dataframe[col])).sum(),  # #NAs
                len(df1),  # #<1Perc.
                len(df99)  # #>99Perc.
            ]
        sum_df[col] = {i: v for i, v in zip(infocols, colvalues)}
    return pd.DataFrame(data=list(sum_df.values()),
                        columns=infocols,
                        index=list(sum_df.keys()))
def _get_target_neighbors(self, df_sched_expected_, n_neighbors=2):
    # Row-wise aggregate statistics; note that the plain np.mean/min/max/std
    # propagate NaNs, while the quantile aggregates are NaN-aware.
    agg_funcs = {'mean': lambda x: np.mean(x, axis=1),
                 'min': lambda x: np.min(x, axis=1),
                 'max': lambda x: np.max(x, axis=1),
                 'q25': lambda x: np.nanquantile(x, 0.25, axis=1),
                 'median': lambda x: np.nanquantile(x, 0.5, axis=1),
                 'q75': lambda x: np.nanquantile(x, 0.75, axis=1),
                 'std': lambda x: np.std(x, axis=1),
                 'count': lambda x: np.sum(~np.isnan(x), axis=1),
                 'sum': lambda x: np.sum(x, axis=1)}
    df_sim = pd.DataFrame()
    print(f"Calculating aggregate statistics for {self.target_value} behavior.")
    for agg, func in agg_funcs.items():
        df_sim[agg] = func(df_sched_expected_)
    X_sim, _ = self._pca_reduction(df_sim)
    print(f"Computing {self.target_value} nearest neighbors by similarity "
          f"in aggregate statistics.")
    neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    neighbors.fit(X_sim)
    distances, indices = neighbors.kneighbors(X_sim)
    return distances, indices
def scatterplot(xs, ys, xlabel, ylabel, id_line=False, linewidth=1, ax=None):
    """
    General scatterplot function
    :param xs: x values
    :type xs: array-like
    :param ys: y values
    :type ys: array-like
    :param xlabel: label for the x axis
    :param ylabel: label for the y axis
    :param id_line: boolean, whether or not to plot identity line
    :param linewidth: width of the identity line
    :param ax: existing Axes to draw into; a new figure is created if None
    :return: figure handle or Axes object
    """
    return_fig = False
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(4, 4))
        return_fig = True
    if id_line:
        # Span the identity line over the central 98% of both variables.
        lmin = np.nanmin([np.nanquantile(xs, 0.01), np.nanquantile(ys, 0.01)])
        lmax = np.nanmax([np.nanquantile(xs, 0.99), np.nanquantile(ys, 0.99)])
        ax.plot([lmin, lmax], [lmin, lmax], '-', color=[0.7, 0.7, 0.7],
                linewidth=linewidth)
    ax.scatter(xs, ys, marker='.', s=150, edgecolors=[1, 1, 1], alpha=1.0,
               color='k')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if return_fig:
        plt.show()
        return fig
    else:
        return ax
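# Typical call, with synthetic data: noisy predictions against ground truth,
# plus the identity line for reference.
import numpy as np

rng = np.random.default_rng(1)
xs = rng.normal(size=100)
ys = xs + rng.normal(scale=0.3, size=100)
fig = scatterplot(xs, ys, 'measured', 'predicted', id_line=True)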
def quick_min_max(x, q=None):
    """Estimate the min/max values of input by down-sampling.

    :param numpy.ndarray x: data, 2D array for now.
    :param float/None q: quantile when calculating the min/max, which
        must be within [0, 1].

    :return tuple: (min, max)
    """
    if not isinstance(x, np.ndarray):
        raise TypeError("Input must be a numpy.ndarray!")

    if x.ndim != 2:
        raise ValueError("Input must be a 2D array!")

    # Halve the longer axis until the array is small enough to scan quickly.
    while x.size > 1e5:
        sl = [slice(None)] * x.ndim
        sl[np.argmax(x.shape)] = slice(None, None, 2)
        x = x[tuple(sl)]

    if q is None:
        return np.nanmin(x), np.nanmax(x)

    if q < 0.5:
        q = 1 - q

    # Let np.nanquantile handle the case when q is outside [0, 1].
    # Caveats: nanquantile is about 30 times slower than nanmin/nanmax,
    # and the `interpolation` keyword was renamed `method` in NumPy >= 1.22.
    return np.nanquantile(x, 1 - q, interpolation='nearest'), \
        np.nanquantile(x, q, interpolation='nearest')
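# Usage sketch with made-up data, showing the symmetric-quantile behavior
# (a q below 0.5 is mirrored to 1 - q).
import numpy as np

img = np.random.default_rng(2).normal(size=(1024, 1024))
img[::50, ::50] = np.nan  # NaNs are ignored by the nan-aware reductions

print(quick_min_max(img))          # plain min/max of the down-sampled data
print(quick_min_max(img, q=0.99))  # robust 1st/99th percentile bounds
print(quick_min_max(img, q=0.01))  # identical to q=0.99 after mirroring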
def silverman_bw(x):
    """Silverman's rule-of-thumb bandwidth for kernel density estimation."""
    import numpy as np
    s = np.nanstd(x)
    # Bug fix: the IQR must come from `x` itself, not an unrelated `df`.
    IQR = np.nanquantile(x, .75) - np.nanquantile(x, .25)
    A = np.min([s, IQR / 1.349])
    n = np.count_nonzero(~np.isnan(x))
    return 0.9 * A * (n**(-0.2))
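# Quick check against the textbook value: for ~N(0, 1) data, Silverman's
# rule gives roughly 0.9 * n**(-1/5); the sample is synthetic.
import numpy as np

x = np.random.default_rng(3).normal(size=1000)
print(silverman_bw(x))  # ~0.9 * 1000**(-0.2) ≈ 0.23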
def getBounds(varName, data=None):
    # `data=None` avoids the mutable-default-argument pitfall of `data=[]`.
    if data is not None and len(data) > 0:
        data = data.flatten()
        bounds = (np.nanquantile(data, 0.05), np.nanquantile(data, 0.95))
    else:
        bounds = getBoundsEx(varName)
    return bounds
def measurement(predictions, signals, supervision_type="classification", **args):
    """Score `predictions` against `signals` under the given supervision type."""
    if supervision_type == "classification":
        return np.mean(predictions - signals == 0)
    elif supervision_type == "l2":
        return np.mean(np.square(predictions - signals)) * 0.5
    elif supervision_type == "hit-MR":
        top_K = args["top_K"]
        info_df = args["info_df"]
        item_info_dict = dict(zip(list(info_df.take([0], axis=1).values.flatten()),
                                  list(info_df.take([1], axis=1).values.flatten())))
        n_usrs = signals.shape[0]
        n_results = len(top_K) + 1
        results = np.zeros((n_usrs, n_results))
        start_time = time.time()
        for idx, row in signals.iterrows():
            try:
                retrieved_items = [item_info_dict[item_id]
                                   for item_id in predictions[row["usr_id"]][0].split(",")
                                   if item_id in item_info_dict]
                n_level, n_trigger_item = predictions[row["usr_id"]][1]
                test_items = set([item_info_dict[item_id]
                                  for item_id in row["test_item"].split(",")
                                  if item_id in item_info_dict])
                n_test = len(test_items)
                assert n_trigger_item >= 1
                assert n_test >= 1
            except (KeyError, AssertionError):
                # Missing user/items or an empty test set: mark the row NaN.
                # (The original bare `except:` is narrowed to the exceptions
                # this block is expected to raise.)
                results[idx] = np.array([np.nan] * n_results)
                continue
            n_total_retrieved = n_level * n_trigger_item
            # compute hit scores
            for j, K in enumerate(top_K):
                bucket_size = int(np.ceil(int(K) * 1.0 / n_trigger_item)) * n_trigger_item
                results[idx][j] = len(test_items.intersection(
                    set(retrieved_items[:bucket_size]))) / n_test
            # compute mean rank
            for itm in test_items:
                if itm in retrieved_items:
                    results[idx][n_results - 1] += (int(retrieved_items.index(itm) / n_trigger_item)
                                                    + 0.5) * n_trigger_item
                else:
                    results[idx][n_results - 1] += n_total_retrieved
            results[idx][n_results - 1] /= n_test
            if idx % 10000 == 0:
                print("User id: {idx}; Elapsed time: {elapsed_time}s.".format(
                    idx=idx, elapsed_time=time.time() - start_time))
        return {'mean': np.nanmean(results, axis=0),
                'std': np.nanstd(results, axis=0),
                'Q25': np.nanquantile(results, 0.25, axis=0),
                'Q50': np.nanquantile(results, 0.5, axis=0),
                'Q75': np.nanquantile(results, 0.75, axis=0),
                'Q90': np.nanquantile(results, 0.9, axis=0),
                'Q95': np.nanquantile(results, 0.95, axis=0)}
def qq_correction(df_m, estacion):
    '''
    Receives a DataFrame of model values and corrects them using the
    quantile-quantile (quantile mapping) technique.

    The df_m DataFrame MUST contain two columns:
    Fecha: the date, as a pandas datetime
    Variable: the values of the variable

    The function retrieves the historical values of model and observation
    and generates the ECDF for both. Then, for each value in df_m, it adds
    a new column with the corrected value.

    Reference: Boe et al., 2007. 'Statistical and dynamical downscaling of
    the Seine basin climate for hydro-meteorological studies'
    '''
    # Check columns
    columnas = df_m.columns
    list2 = ['Fecha', 'tmax', 'tmin', 'radsup', 'velviento', 'hr']
    result = any(elem in columnas for elem in list2)
    if result and len(columnas) == 2:
        print(' ################# Q-Q correction ', columnas[-1],
              ' #################')
        tot_val = len(df_m)
        df_m = df_m.assign(month=pd.DatetimeIndex(df_m.loc[:, 'Fecha']).month)
        corrected_values = np.empty(tot_val)
        corrected_values[:] = np.nan
        # Cap the CDF probability just below 1 to keep the quantiles finite.
        cdf_limite = .99999999
        # Correct each row with data according to its month.
        df_m.reset_index(drop=True, inplace=True)
        for index, row in df_m.iterrows():
            ecdf_m, datos_m, ecdf_o, datos_o = get_ecdf(
                columnas[-1], estacion, row.month)
            dato = row[columnas[-1]]  # last column is data, first is Fecha
            p = ecdf_m(dato)
            if p > cdf_limite:
                p = cdf_limite
            corr_o = np.nanquantile(datos_o, p, interpolation='linear')
            corr_m = np.nanquantile(datos_m, p, interpolation='linear')
            # Shift the model value by the obs-model gap at this quantile.
            corrected_values[index] = dato + (corr_o - corr_m)
        df_out = df_m.loc[:, [columnas[0], columnas[1]]].copy()
        df_out = df_out.assign(corregido=corrected_values)
        df_out.columns = ['Fecha', columnas[-1], columnas[-1] + '_corr']
        return df_out
    else:
        err_txt = '''
        ########### ERROR ##############\n
        Not all required columns are present for the Q-Q data correction.\n
        exit()\n
        ################################
        '''
        print(err_txt)
        exit()
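# The core quantile-mapping step, stripped of the get_ecdf plumbing. This is
# a sketch on synthetic samples; statsmodels' ECDF stands in for whatever
# get_ecdf returns in the function above.
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

rng = np.random.default_rng(4)
obs = rng.normal(20.0, 3.0, size=5000)    # "observed" historical values
model = rng.normal(22.0, 4.0, size=5000)  # biased "model" values

ecdf_m = ECDF(model)
value = 25.0                         # raw model value to correct
p = min(ecdf_m(value), 0.99999999)   # same CDF cap as qq_correction uses
corrected = value + (np.nanquantile(obs, p) - np.nanquantile(model, p))
print(corrected)  # shifted toward the observed distribution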
def main(args=None):
    """
    Main function to generate the polarization plot.
    """
    args = parse_arguments().parse_args(args)
    matplotlib.rcParams['pdf.fonttype'] = 42
    pc1 = pd.read_table(args.pca, header=None, sep="\t",
                        dtype={0: "object", 1: "Int64", 2: "Int64",
                               3: "float32"})
    pc1 = pc1.rename(columns={0: "chr", 1: "start", 2: "end", 3: "pc1"})
    if args.outliers != 0:
        quantile = [args.outliers / 100, (100 - args.outliers) / 100]
        boundaries = np.nanquantile(pc1['pc1'].values.astype(float), quantile)
        quantiled_bins = np.linspace(boundaries[0], boundaries[1],
                                     args.quantile)
    else:
        quantile = [j / (args.quantile - 1) for j in range(0, args.quantile)]
        quantiled_bins = np.nanquantile(pc1['pc1'].values.astype(float),
                                        quantile)
    pc1["quantile"] = np.searchsorted(quantiled_bins,
                                      pc1['pc1'].values.astype(float),
                                      side="right")
    # Bug fix: `== np.nan` is always False and chained indexing does not
    # assign; use isna() with a single .loc call instead.
    pc1.loc[pc1["pc1"].isna(), "quantile"] = args.quantile + 1
    polarization_ratio = []
    output_matrices = []
    labels = []
    for matrix in args.obsexp_matrices:
        obs_exp = hm.hiCMatrix(matrix)
        pc1["bin_id"] = pc1.apply(lambda row: get_indices(obs_exp, row),
                                  axis=1)
        name = ".".join(matrix.split("/")[-1].split(".")[0:-1])
        labels.append(name)
        normalised_sum_per_quantile = count_interactions(
            obs_exp, pc1, args.quantile, args.offset)
        normalised_sum_per_quantile = np.nan_to_num(
            normalised_sum_per_quantile)
        if args.outputMatrix:
            output_matrices.append(normalised_sum_per_quantile)
        polarization_ratio.append(
            within_vs_between_compartments(normalised_sum_per_quantile,
                                           args.quantile))
    if args.outputMatrix:
        np.savez(args.outputMatrix, output_matrices)
    plot_polarization_ratio(polarization_ratio, args.outputFileName, labels,
                            args.quantile)
def _quantile(arr, q):
    # Note: `out` reuses arr.dtype, so integer inputs would truncate the
    # (float) quantiles; callers are expected to pass float arrays.
    if arr.ndim == 1:
        out = np.empty((q.size, ), dtype=arr.dtype)
        out[:] = np.nanquantile(arr, q)
    else:
        out = np.empty((arr.shape[0], q.size), dtype=arr.dtype)
        for index in range(out.shape[0]):
            out[index] = np.nanquantile(arr[index], q)
    return out
def getmidr(traj, thr):
    # Robust center of a trajectory: trim the outer `thr` fraction of
    # coordinates before taking the bounding-box midpoint.
    coords = np.reshape(traj, (-1, 2))
    minx = np.nanquantile(coords[:, 0], thr)
    maxx = np.nanquantile(coords[:, 0], 1 - thr)
    miny = np.nanquantile(coords[:, 1], thr)
    maxy = np.nanquantile(coords[:, 1], 1 - thr)
    mid = np.array([(maxx + minx) / 2, (maxy + miny) / 2])
    # Radius: the (1 - thr) quantile of distances from that center.
    r = np.sqrt((coords[:, 0] - mid[0])**2 + (coords[:, 1] - mid[1])**2)
    return mid, np.nanquantile(r, 1 - thr)
def fit(self, X: Union[np.ndarray, pd.DataFrame], y=None):
    assert self.factor >= 0
    X_ = np.asarray(X)
    # Note: despite the name, `mean_` stores the per-column median.
    self.mean_ = np.nanmedian(X_, axis=0)
    self.high_q_ = np.nanquantile(X_, self.high_quantile, axis=0)
    self.low_q_ = np.nanquantile(X_, self.low_quantile, axis=0)
    # Widen (or shrink) the quantile band around the center by `factor`.
    self.high_ = (self.high_q_ - self.mean_) * self.factor + self.mean_
    self.low_ = (self.low_q_ - self.mean_) * self.factor + self.mean_
    return self
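# A hedged usage sketch: the class around fit() is not shown, so assume a
# scikit-learn-style transformer (called QuantileClipper here purely for
# illustration) whose constructor stores low_quantile, high_quantile and
# factor; clipping to the learned band is then done by hand.
import numpy as np

X = np.array([[1.0], [2.0], [3.0], [100.0]])  # 100.0 is an outlier

clipper = QuantileClipper(low_quantile=0.25, high_quantile=0.75, factor=1.5)
clipper.fit(X)
print(np.clip(X, clipper.low_, clipper.high_).ravel())  # outlier pulled in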
def histme(x, y, color, **kwargs):
    rbins = (np.array([8, 5.7]) * 20).astype(int)
    # x-range is trimmed to the central 99.9% of the data; the y-range is
    # fixed to [0.01, 0.99] rather than derived from quantiles.
    plotrange = [[np.nanquantile(x, .0005), np.nanquantile(x, .9995)],
                 [0.01, 0.99]]
    plt.hist2d(x, y, range=plotrange, bins=rbins, cmap="Blues")
    plt.ylim(0, 1)
def __init__(self, *args, **kwargs):
    tkinter.Tk.__init__(self, *args, **kwargs)
    self.resizable(width=False, height=False)
    # Row labels (Polish): skewness index, positional skewness index,
    # positional asymmetry coefficient, classical asymmetry coefficient,
    # kurtosis coefficient, excess kurtosis coefficient.
    self.funkcje = ('Wskaźnik skośności', 'Pozycyjny wskaźnik skośności',
                    'Pozycyjny współczynnik asymetrii',
                    'Klasyczny współczynnik asymetrii',
                    'Współczynnik kurtozy', 'Współczynnik ekscesu')
    self.save = []
    for x in range(len(selected_header)):
        sko = stats.skew(selected_data.iloc[:, x])
        y = np.sort(selected_data.iloc[:, x])
        # Bowley-style positional skewness: Q3 + Q1 - 2*median,
        # then normalized by the interquartile range.
        poz_sko = np.nanquantile(y, q=0.75) + np.nanquantile(
            y, q=0.25) - 2 * (np.nanmedian(y))
        poz_asy = poz_sko / (np.nanquantile(y, q=0.75) -
                             np.nanquantile(y, q=0.25))
        mean = np.nanmean(selected_data.iloc[:, x])
        # Third central moment for the classical asymmetry coefficient.
        a = 0
        for i in range(selected_data.shape[0]):
            a = a + ((selected_data.iloc[i, x] - mean)**3)
        m3 = a / selected_data.shape[0]
        kla_asy = m3 / (np.nanstd(selected_data.iloc[:, x])**3)
        kurtoza = stats.kurtosis(selected_data.iloc[:, x], axis=0,
                                 fisher=False)
        k1 = (stats.kurtosis(
            selected_data.iloc[:, x], axis=0, fisher=False)) - 3
        self.Wyniki = [sko, poz_sko, poz_asy, kla_asy, kurtoza, k1]
        self.save.append(self.Wyniki)
        wypelanianie_tabeli_w_petli(len(self.funkcje), self, x)
    tworzenie_tabel_w_petli(selected_header, self, poziom='True')
    tworzenie_tabel_w_petli(self.funkcje, self, poziom='False')
    # 'Zapisz wyniki' = "Save results".
    self.l1 = Button(self, text='Zapisz wyniki', command=self.zapisz)
    self.l1.grid(row=len(self.funkcje) + 3,
                 column=len(selected_header) + 1,
                 pady=10, sticky=W)
    self.wolny = Label(self, text='  ', padx=10, pady=10)
    self.wolny.grid(row=len(self.funkcje) + 3,
                    column=len(selected_header) + 3)
def list_aggregator(aggregatemethod, all_data):
    # AGGREGATE METHOD
    if aggregatemethod == 'mean':
        answer = np.nanmean(all_data)
    elif aggregatemethod == 'median':
        answer = np.nanmedian(all_data)
    elif aggregatemethod == 'minimum':
        answer = np.nanmin(all_data)
    elif aggregatemethod == 'q1':
        answer = np.nanquantile(all_data, 0.25)
    elif aggregatemethod == 'q3':
        answer = np.nanquantile(all_data, 0.75)
    elif aggregatemethod == 'maximum':
        answer = np.nanmax(all_data)
    elif aggregatemethod == 'stdev':
        answer = np.nanstd(all_data)
    elif aggregatemethod == 'medianabdev':
        # median_absolute_deviation was removed in SciPy 1.9; this is the
        # behavior-preserving replacement.
        answer = stats.median_abs_deviation(all_data, nan_policy='omit',
                                            scale='normal')
    elif aggregatemethod == 'iqr':
        answer = (np.nanquantile(all_data, 0.75) -
                  np.nanquantile(all_data, 0.25))
    elif aggregatemethod == 'range':
        answer = np.nanmax(all_data) - np.nanmin(all_data)
    elif aggregatemethod == 'maxq3':
        answer = np.nanmax(all_data) - np.nanquantile(all_data, 0.75)
    elif aggregatemethod == 'q1min':
        answer = np.nanquantile(all_data, 0.25) - np.nanmin(all_data)
    elif aggregatemethod == 'q3q1avg':
        answer = (np.nanquantile(all_data, 0.75) +
                  np.nanquantile(all_data, 0.25)) / 2
    elif aggregatemethod == 'q3q1avgoveriqr':
        q1_perf = np.nanquantile(all_data, 0.25)
        q3_perf = np.nanquantile(all_data, 0.75)
        answer = ((q3_perf + q1_perf) / 2) / (q3_perf - q1_perf)
    elif aggregatemethod == 'maxminavg':
        answer = (np.nanmax(all_data) + np.nanmin(all_data)) / 2
    elif aggregatemethod == 'maxminavgoverrange':
        min_perf = np.nanmin(all_data)
        max_perf = np.nanmax(all_data)
        answer = ((max_perf + min_perf) / 2) / (max_perf - min_perf)
    else:
        # Previously an unknown method raised UnboundLocalError; fail loudly.
        raise ValueError(f"unknown aggregate method: {aggregatemethod!r}")
    return answer
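# For example (values invented; NaNs are ignored by every branch):
import numpy as np

data = [1.0, 2.0, np.nan, 4.0, 10.0]
print(list_aggregator('median', data))  # 3.0
print(list_aggregator('iqr', data))     # 5.5 - 1.75 = 3.75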
def calc_weights(pop, urban, ntl, targets, more, access):
    """Calculate per-cell electricity-access weights from population,
    urbanization, and night-time-lights (ntl) rasters."""
    # The calculated weights for each segment will go here
    if access["urban"] > 0.9:
        weights = np.ones_like(pop) * access["rural"]
        weights[urban >= 2] = access["urban"]
        return weights
    weights = np.zeros_like(pop)
    # Investigate each combination of urban/rural and four quartiles
    # of population density
    for loc in ["urban", "rural"]:
        for q in [0.25, 0.5, 0.75, 1]:
            # Values of 2 and 3 are considered urban
            if loc == "urban":
                condition_del = urban < 3
                access_level = access["urban"]
            else:
                condition_del = urban >= 3
                access_level = access["rural"]
            # Ignore errors from doing arr[arr < x] with nan values
            with np.errstate(invalid="ignore"):
                pop_temp = np.copy(pop)  # local copy of pop for this loop
                pop_temp[condition_del] = np.nan  # remove urban/rural
                pop_temp[targets == 0] = np.nan  # remove not electrified
                # Filter to only keep this quartile
                quant_below = np.nanquantile(pop_temp, q - 0.25)
                quant = np.nanquantile(pop_temp, q)
                pop_temp[pop_temp <= quant_below] = np.nan
                pop_temp[pop_temp > quant] = np.nan
                # Get the average brightness per person of the top x% for
                # this quartile, where x is the rural/urban access rate
                ntl_per_pop = ntl / pop_temp
                ntl_quant = min(max(1 - access_level - more[loc][q], 0), 1)
                ntl_cut = np.nanquantile(ntl_per_pop, ntl_quant)
                # Assign weights according to the formula below
                w = 1 - (ntl_cut - ntl_per_pop) / ntl_cut
                w[w > 0.95] = 0.95  # cap weights at 0.95
                w[np.isnan(w)] = 0
                # Add the successive weights to the main array
                weights += w
    return weights
def test_no_p_overwrite(self): # this is worth retesting, because quantile does not make a copy p0 = np.array([0, 0.75, 0.25, 0.5, 1.0]) p = p0.copy() np.nanquantile(np.arange(100.), p, interpolation="midpoint") assert_array_equal(p, p0) p0 = p0.tolist() p = p.tolist() np.nanquantile(np.arange(100.), p, interpolation="midpoint") assert_array_equal(p, p0)
def test_regression(self): ar = np.arange(24).reshape(2, 3, 4).astype(float) ar[0][1] = np.nan assert_equal(np.nanquantile(ar, q=0.5), np.nanpercentile(ar, q=50)) assert_equal(np.nanquantile(ar, q=0.5, axis=0), np.nanpercentile(ar, q=50, axis=0)) assert_equal(np.nanquantile(ar, q=0.5, axis=1), np.nanpercentile(ar, q=50, axis=1)) assert_equal(np.nanquantile(ar, q=[0.5], axis=1), np.nanpercentile(ar, q=[50], axis=1)) assert_equal(np.nanquantile(ar, q=[0.25, 0.5, 0.75], axis=1), np.nanpercentile(ar, q=[25, 50, 75], axis=1))
def array_nanquantile_global(arr, q): return np.nanquantile(arr, q)
def time_nanquantile(self, array_size, percent_nans): np.nanquantile(self.arr, q=0.2)
def test_basic(self): x = np.arange(8) * 0.5 assert_equal(np.nanquantile(x, 0), 0.) assert_equal(np.nanquantile(x, 1), 3.5) assert_equal(np.nanquantile(x, 0.5), 1.75)