def fit_transform(self, X, y=None):
    """Fit one ECDF per column and return the normalized columns.

    Args:
        X (numpy.array): 2-D array of numerical columns to normalize.
        y: Not used by this method.

    Returns:
        numpy.array: normalized numerical columns. This is a copy; the
        array passed in by the caller is left untouched.
    """
    # Work on a copy so the caller's array is not overwritten column by
    # column as a side effect of fitting.
    X = X.copy()
    n_cols = X.shape[1]
    self.ecdfs = [None] * n_cols
    for col in range(n_cols):
        self.ecdfs[col] = ECDF(X[:, col])
        # NOTE(review): the result is written back using X's own dtype; an
        # integer X would truncate the transformed values -- confirm that
        # callers always pass floating-point data.
        X[:, col] = self._transform_col(X[:, col], col)
    return X
def summarize(data, name=None, units=None, plot=True):
    """Summarize an array of posterior sample draws.

    Parameters
    ----------
    data : pandas Series or numpy array
        A 1D array containing MCMC posterior sample draws.
    name : string
        Name of the parameter; automatically pulled from ``data.name`` if
        data is a pd.Series and name is None.
    units : string
        The units associated with the parameter; appended to the name as
        ``"name (units)"`` when given.
    plot : bool, default True
        Whether or not to display an empirical cumulative distribution
        (ECDF) plot.

    Returns
    -------
    summary : DataFrame
        One-row frame with the columns 'value' (the name), 'median' and
        '95% CR' (the 2.5% and 97.5% quantiles), all rounded to 3 decimals.
    """
    if isinstance(data, pd.Series) and name is None:
        name = data.name
    if units is not None:
        name = f'{name} ({units})'

    median, lower, upper = np.round(
        np.quantile(data, [0.5, 0.025, 0.975]), 3)
    df = pd.DataFrame(
        {
            'value': name,
            'median': median,
            '95% CR': f'[{lower}, {upper}]'
        },
        index=[0])

    if plot:
        # np.asarray works for both Series and ndarray input; `.values`
        # only exists on pandas objects and broke the numpy-array case.
        ecdf = ECDF(np.asarray(data))
        p = bokeh.plotting.figure(plot_width=400, plot_height=300)
        p.xaxis.axis_label = name
        p.yaxis.axis_label = 'ECDF'
        p.circle(ecdf.x, ecdf.y)
        bokeh.io.show(p)
    return df
def PlotNumerical(DF, targetcols=None, figsize=(10, 5), ticksfontsize=12,
                  titlefontsize=20, kde=True):
    """Draw a distribution plot, a box plot and an ECDF for each column.

    Args:
        DF: DataFrame holding the columns to plot.
        targetcols: iterable of column names to plot; all columns of DF
            when None.
        figsize: figure size passed to ``plt.figure``.
        ticksfontsize: font size of the x and y tick labels.
        titlefontsize: font size of the per-column figure title.
        kde: whether ``sns.distplot`` should draw a KDE; if that attempt
            raises, the plot is retried without a KDE.
    """
    # `is None` rather than `== None`: comparing an array-like of column
    # names with `==` is element-wise and has no single truth value.
    SelectColumns = DF.columns.values if targetcols is None else targetcols

    def _style_current_axes():
        # Shared grid/tick styling for the three subplots.
        plt.grid()
        plt.xticks(fontsize=ticksfontsize)
        plt.yticks(fontsize=ticksfontsize)

    for col in SelectColumns:
        plt.figure(figsize=figsize)
        plt.suptitle(col, fontsize=titlefontsize, y=0.91)

        # Top-left: histogram (optionally with KDE).
        plt.subplot(221)
        _style_current_axes()
        try:
            sns.distplot(DF[col], kde=kde)
        except Exception:
            # Best-effort fallback to a plain histogram.
            sns.distplot(DF[col], kde=False)

        # Top-right: box plot.
        plt.subplot(222)
        _style_current_axes()
        sns.boxplot(x=col, data=DF)

        # Bottom: ECDF, rescaled by its maximum.
        plt.subplot(212)
        _style_current_axes()
        ecdf = ECDF(DF[col])
        plt.plot(ecdf.x, ecdf.y / ecdf.y.max())
        plt.xlabel('Value')
        plt.ylabel('ECDF')
        plt.show()
def inv_ecdf_vs_pred_entropy(probabilities, label=None, color='b',
                             linestyle='-', axis=None):
    """Plot ``1 - ECDF`` of the predictive entropy of ``probabilities``.

    The curve is evaluated on ``probabilities.shape[1] * 100`` evenly spaced
    points between 0 and ``log(probabilities.shape[1])``.

    Args:
        probabilities: 2-D array passed to ``predictive_entropy``
            (presumably one row per sample -- confirm against callers).
        label: legend label of the curve.
        color: line colour.
        linestyle: line style.
        axis: existing matplotlib axis to draw on; a new 12x7 figure is
            created when None.
    """
    n_cols = probabilities.shape[1]
    entropy_ecdf = ECDF(predictive_entropy(probabilities))
    x_lim = np.log(n_cols)
    entropy_range = np.linspace(0.0, x_lim, n_cols * 100)

    if axis is not None:
        ax = axis
    else:
        fig, ax = plt.subplots(figsize=(12, 7), tight_layout=True)

    survival = 1 - entropy_ecdf(entropy_range)
    ax.plot(entropy_range, survival, c=color, ls=linestyle, lw=3,
            label=label, clip_on=False)

    # Keep the automatic lower limits, pin the upper ones.
    x_low = ax.get_xlim()[0]
    ax.set_xlim(x_low, np.ceil(x_lim))
    y_low = ax.get_ylim()[0]
    ax.set_ylim(y_low, 1)

    ax.tick_params(direction='out', labelsize=14)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.tick_params(direction='out', labelsize=14, right=False, top=False)

    ax.set_ylabel('1-ecdf', fontsize=16)
    ax.set_xlabel('Predictive Entropy', fontsize=16)
def q4():
    """Return the answer to question 4.

    Standardizes ``mean_profile`` of the rows of ``stars`` whose target is
    False and evaluates its ECDF at the 0.80, 0.90 and 0.95 quantiles of a
    standard normal distribution.

    Returns:
        tuple: the three ECDF values, each rounded to 3 decimals.
    """
    # Standardize the mean profile of the non-pulsar rows.
    mean_profile = stars.loc[stars['target'] == False]['mean_profile']
    false_pulsar_mean_profile_standardized = (
        (mean_profile - mean_profile.mean()) / mean_profile.std())

    # Theoretical quantiles for mean = 0 and standard deviation = 1.
    theoretical_quantiles = [
        sct.norm.ppf(level, loc=0, scale=1) for level in (0.8, 0.9, 0.95)
    ]

    # Empirical probabilities associated with those quantiles.
    empirical_cdf = ECDF(false_pulsar_mean_profile_standardized)
    return tuple(empirical_cdf(q).round(3) for q in theoretical_quantiles)
def ecdf(x, N=100, inverse=False):
    """
    Thin wrapper around the statsmodels ECDF.

    Arguments:
        x: array of points for which to find the ECDF
        N: number of evenly spaced output points between min(x) and max(x)
        inverse: return 1 - ECDF instead of the ECDF

    Returns:
        x_out: an array of N values spanning the range of x
        y_out: the ECDF (or 1 - ECDF) evaluated at x_out
    """
    import numpy as np
    from statsmodels.distributions.empirical_distribution import ECDF

    lowest, highest = min(x), max(x)
    grid = np.linspace(lowest, highest, N)
    levels = ECDF(x)(grid)
    return grid, (1 - levels if inverse else levels)
def quantile_correction(obs_data, mod_data, sce_data, modified=True):
    """Add a quantile-based correction term to ``sce_data``.

    Each scenario value is ranked with the ECDF of ``mod_data``; the
    correction is the difference between the ``obs_data`` and ``mod_data``
    percentiles at that rank (NaNs ignored by the percentile calls).

    Args:
        obs_data: reference ("observed") sample.
        mod_data: model sample used to build the ECDF.
        sce_data: values to correct.
        modified: when True, the correction is rebuilt from the difference
            and ratio of the two medians and from the ratio of the two
            interquartile ranges.

    Returns:
        ``sce_data`` plus the correction.
    """
    rank_pct = ECDF(mod_data)(sce_data) * 100
    cor = np.subtract(np.nanpercentile(obs_data, rank_pct),
                      np.nanpercentile(mod_data, rank_pct))
    if not modified:
        return sce_data + cor

    obs_median = np.nanpercentile(obs_data, 50)
    mod_median = np.nanpercentile(mod_data, 50)
    mid = np.subtract(obs_median, mod_median)
    g = np.true_divide(obs_median, mod_median)

    obs_q75, obs_q25 = np.nanpercentile(obs_data, [75, 25])
    mod_q75, mod_q25 = np.nanpercentile(mod_data, [75, 25])
    f = np.true_divide(np.subtract(obs_q75, obs_q25),
                       np.subtract(mod_q75, mod_q25))

    cor = g * mid + f * (cor - mid)
    return sce_data + cor
def q4():
    """Evaluate the ECDF of the standardized non-pulsar ``mean_profile``.

    The ECDF is evaluated at the 0.80, 0.90 and 0.95 quantiles of a
    standard normal distribution.

    Returns:
        tuple: the three ECDF values, each rounded to 3 decimals.
    """
    is_false_pulsar = stars['target'] == False
    false_pulsar_mean_profile_standardized = sct.zscore(
        stars.loc[is_false_pulsar, 'mean_profile'])

    # ECDF built from the standardized star data.
    compute_ecdf_stars = ECDF(false_pulsar_mean_profile_standardized)

    # Percent point function of N(0, 1) at each level.
    return tuple(
        round(compute_ecdf_stars(sct.norm.ppf(level, loc=0, scale=1)), 3)
        for level in (0.80, 0.90, 0.95)
    )
def q2():
    """Answer of question 02

    Returns
    -------
    float
        Empirical probability that a value of ``dataframe['normal']`` lies
        within one standard deviation of its mean.
    """
    normal = dataframe['normal']
    center, spread = normal.mean(), normal.std()
    empirical_cdf = ECDF(normal)
    below_lower = empirical_cdf(center - spread)
    below_upper = empirical_cdf(center + spread)
    return float(below_upper - below_lower)
def CDF_band(n_samples):
    """Plot the ECDF of standard-Cauchy draws and print plug-in estimates.

    Args:
        n_samples: number of draws taken from ``np.random.standard_cauchy``.

    The ECDF is plotted on 100000 evenly spaced points in [-20, 20]; the
    mean, variance and skewness returned by ``variance_mean`` and
    ``get_skewness`` are then printed.
    """
    samples = np.random.standard_cauchy(n_samples)

    # Estimated empirical CDF.
    ecdf = ECDF(samples)
    line = np.linspace(-20, 20, 100000)
    # The ECDF object accepts arrays, so evaluate the whole grid in one
    # call instead of 100000 scalar calls in a Python loop.
    plt.plot(line, ecdf(line))
    plt.show()

    variance, mean = variance_mean(samples)
    skewness = get_skewness(samples)
    # Plug-in mean
    print('Plugin Estimator for Mean is:', mean)
    # Plug-in variance
    print('Plugin Estimator for Variance is:', variance)
    # Plug-in skewness
    print('Plugin Estimator for Skewness is:', skewness)
def fit_transform(self, X, y=None):
    """Fit one ECDF per column and return the normalized columns.

    Args:
        X (pandas.DataFrame): numerical columns to normalize.
        y: Not used by this method.

    Returns:
        (pandas.DataFrame): normalized numerical columns (a copy of X).
    """
    self.ecdfs = [None] * X.shape[1]
    X = X.copy()
    # Look columns up by their label while keeping the positional index
    # for self.ecdfs: `X[position]` only works when the columns happen to
    # be labelled 0..n-1 and raises KeyError for named columns.
    for col, label in enumerate(X.columns):
        self.ecdfs[col] = ECDF(X[label].values)
        X[label] = self._transform_col(X[label], col)
    return X
def q4():
    """Answer of question 04

    Returns
    -------
    tuple
        ECDF of the standardized ``mean_profile`` of the false-pulsar rows,
        evaluated at the 0.80, 0.90 and 0.95 quantiles of a standard normal
        distribution and rounded to 3 decimals.
    """
    not_pulsar = stars['target'] == False
    profile = stars[not_pulsar]['mean_profile']
    false_pulsar_mean_profile_standardized = (
        (profile - profile.mean()) / profile.std())

    theoretical_quantiles = sct.norm.ppf([.80, .90, .95], loc=0, scale=1)
    empirical_cdf = ECDF(false_pulsar_mean_profile_standardized)
    return tuple(empirical_cdf(theoretical_quantiles).round(3))
def calculate_baseline_scores(self):
    """Compute baseline reachable attributes, per-score ECDFs and the
    reference indicator.

    Populates ``self.base_attributes``, ``self.base_zones_scores``,
    ``self.score_ecdfs`` and ``self.ref_ind``. Returns nothing.
    """
    # TODO: should use the get_reachable_geoms function?
    print('\t Calculating baseline scores')
    # self.base_scores={'walkable_{}'.format(x): [] for x in [
    #     'employment', 'housing', 'healthcare', 'hospitality', 'shopping']}
    base_scores = {}
    self.base_attributes = {}
    self.score_ecdfs = {}
    # Only the zone columns whose name contains 'res_' or 'emp_' are summed.
    stats_to_aggregate = [
        col for col in self.zones.columns
        if (('res_' in col) or ('emp_' in col))
    ]
    # get the baseline reachable attributes and scores for every zone
    # (only the rows selected by the 'reference_area' column are visited)
    for ind, row in self.zones.loc[
            self.zones['reference_area']].iterrows():
        reachable_zones = self.zone_to_reachable[ind]['zones']
        # Sum each aggregated column over the zones reachable from `ind`.
        self.base_attributes[ind] = self.zones.loc[reachable_zones][
            stats_to_aggregate].sum().to_dict()
        # The zone's own totals are stored next to the reachable sums.
        self.base_attributes[ind]['source_res'] = row['res_total']
        self.base_attributes[ind]['source_emp'] = row['emp_total']
        # get scores for individual zones- weighting cancels out
        base_scores[ind] = self.attributes_to_scores(
            [self.base_attributes[ind]])
    # Create the ECDFs for each score using only the zones (not the grid
    # cells)
    self.base_zones_scores = pd.DataFrame.from_dict(base_scores,
                                                    orient='index')
    for score in self.base_zones_scores.columns:
        # `x == x` is False only for NaN, so this drops missing scores.
        base_scores_no_nan = [
            x for x in self.base_zones_scores[score] if x == x
        ]
        self.score_ecdfs[score] = ECDF(base_scores_no_nan)
    # get weighted scores across the simulation area zones
    # (ignore the grid which is empty in reference and therefore would be
    # weighted zero)
    ref_scores = self.attributes_to_scores(
        [self.base_attributes[ind] for ind in self.overlapping_geoids])
    self.ref_ind = self.normalise_ind(ref_scores)
    # get the base reachable attributes for every grid cell location
    # NOTE(review): grid cells are stored in self.base_attributes under
    # their integer position, alongside the zone indices -- confirm the two
    # key spaces cannot collide.
    for i_c in range(len(self.geogrid)):
        reachable_zones = self.grid_to_reachable[i_c]['zones']
        self.base_attributes[i_c] = self.zones.loc[reachable_zones][
            stats_to_aggregate].sum().to_dict()
def plot_inter_limitorder_time_ecdf(self, show=True):
    """
    Plot the empirical cumulative distribution function of the time gaps
    between consecutive rows of self.orderbook with Type == 1.

    The selected rows are stored in self.limitorders together with an
    "IntertradeTimes" column produced by self.get_intertrade_times(gap,
    'seconds').

    :param show: when True, draw the ECDF on a semilog-x axis and show it
    :return: the fitted ECDF object
    """
    # Explicit copy: the column added below must be written to an
    # independent frame, not to a slice of self.orderbook (chained
    # assignment).
    self.limitorders = self.orderbook.loc[self.orderbook.Type == 1].copy()

    timestamps = self.limitorders["Timestamp"]
    gaps = (timestamps - timestamps.shift(1)).astype('timedelta64[ns]')
    # NOTE(review): the first gap has no predecessor and is therefore
    # missing; what ends up in the ECDF for it depends on
    # get_intertrade_times -- confirm.
    self.limitorders["IntertradeTimes"] = gaps.apply(
        lambda gap: self.get_intertrade_times(gap, 'seconds'))

    ecdf = ECDF(self.limitorders.IntertradeTimes.values)
    if show:
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111)
        ax.ticklabel_format(axis='y', style='sci', scilimits=(-2, -1))
        ax.semilogx(ecdf.x, ecdf.y)
        plt.show()
    return ecdf
def main():
    """Score every merged session file and write the results as TSV.

    For each file returned by ``get_filenames("merged-" + args.reg)`` the
    causative similarity is compared with 1000 baseline draws; the row
    written to ``<args.reg>_results.csv`` holds the name, age, causative
    result, median of the baseline results and the ECDF value of the
    causative result within the baseline distribution.
    """
    # NOTE: pickle.load can execute arbitrary code from a crafted file;
    # only load verb pickles from a trusted source.
    with open("verbs/" + args.reg + "_verbs.pickle", "rb") as f:
        verb_counter = pickle.load(f)

    fileids = get_filenames("merged-" + args.reg)

    # The context manager guarantees the results are flushed and the file
    # is closed even if scoring a session raises; newline="" is what the
    # csv module expects from files it writes to.
    with open("%s_results.csv" % args.reg, "w+", newline="") as results_file:
        results_csvwriter = csv.writer(results_file, delimiter="\t")
        results_csvwriter.writerow(
            ["name", "age", "causative", "random", "percentile"])

        for fileid in fileids:
            print(fileid)
            # File names are parsed as ".../<name>_<age>.<ext>".
            name = fileid.split("/")[-1].split("_")[0]
            age = fileid.split("_")[1].split(".")[0]
            sents = read_sents(fileid)
            if args.build:
                we_on_merged_sessions(sents, name + "-" + age)

            caus_counter, caus_result = get_similarity(name, age, args.reg)
            print(caus_counter)
            random_results = baseline(name, age, caus_counter, 1000,
                                      verb_counter, args.reg)
            random_result = statistics.median(random_results)

            # percentile of causative in the distribution
            percentile = ECDF(random_results)(caus_result)
            print(percentile)
            results_csvwriter.writerow(
                [name, age, caus_result, random_result, percentile])
def q2():
    """Return the answer to question 2.

    Returns:
        float: ECDF of ``df['normal']`` at (mean + std) minus its value at
        (mean - std), rounded to 3 decimals.
    """
    normal = df['normal']

    # Mean and standard deviation.
    x_mean, x_std = normal.mean(), normal.std()

    # Interval of one standard deviation around the mean.
    lower, upper = x_mean - x_std, x_mean + x_std

    # Empirical CDF of the normal variable.
    empirical_cdf = ECDF(normal)

    # The probability of the interval is the probability at its end minus
    # the probability at its start.
    p_lower = empirical_cdf(lower)
    p_upper = empirical_cdf(upper)
    return round(float(p_upper - p_lower), 3)
def ECDFFitting(r, W, plot=False):
    """Fit an ECDF to 10000 values generated by ``GenMultiRN``.

    Args:
        r: forwarded to ``GenMultiRN``; also shown in the plot title.
        W: forwarded to ``GenMultiRN``.
        plot: when True, plot the ECDF at the sorted sample points and on a
            0.01-spaced grid extending 0.05 beyond the sample range.

    Returns:
        The fitted ECDF object.
    """
    draws = GenMultiRN(r, W, Type="P", Size=10000, Warn=False, HisGen=True)
    Vlist = sorted(draws.ravel())
    ecdf = ECDF(Vlist)

    at_samples = ecdf(Vlist)
    grid = np.arange(min(Vlist) - 0.05, max(Vlist) + 0.05, 0.01)
    on_grid = ecdf(grid)

    if not plot:
        return ecdf

    name = "ECDF"
    plt.figure()
    plt.plot(Vlist, at_samples, 'o', label=f"{name}: Origin", markersize=3)
    plt.plot(grid, on_grid, 'r', label=f"{name}: Fitted ecdf")
    plt.title("ECDF  fitting: r = " + str(r))
    plt.legend()
    plt.show()
    return ecdf
def _single_cdf(self, trial_name):
    """Build the eCDF trace for one trial.

    Every stat stored under every drop rate of ``self.result[trial_name]``
    is passed through ``self._prepare_histogram_data`` and pooled.

    Returns:
        list: a single ``go.Scatter`` step trace whose x values are the
        unique pooled values and whose y values are the ECDF in percent.
    """
    trial = self.result[trial_name]
    samples = []
    for drop_rate in trial.keys():
        for stat in trial[drop_rate]:
            samples.extend(self._prepare_histogram_data(stat))

    support = np.unique(samples)
    cumulative_pct = ECDF(samples)(support) * 100
    trace = go.Scatter(
        name="eCDF",
        x=support,
        y=cumulative_pct,
        line_shape='hv',
        line_color='darkgreen',
    )
    return [trace]
def quantile_mapping(obs_cube, mod_cube, sce_cubes, *args, **kwargs):
    """
    Quantile Mapping

    apply quantile mapping to all scenario cubes using the distributions
    of obs_cube and mod_cube. The scenario cubes are corrected in place.

    Args:

    * obs_cube (:class:`iris.cube.Cube`):
        the observational data

    * mod_cube (:class:`iris.cube.Cube`):
        the model data at the reference period

    * sce_cubes (:class:`iris.cube.CubeList`):
        the scenario data that shall be corrected

    Extra positional and keyword arguments are accepted and ignored.
    """
    from statsmodels.distributions.empirical_distribution import ECDF

    obs_values = obs_cube.data
    # getmaskarray always yields a full boolean array (all False when the
    # data carry no mask). The previous `if mask and mask[index]` test
    # raised "truth value of an array is ambiguous" for any real mask.
    obs_cube_mask = np.ma.getmaskarray(obs_values)

    # Visit every cell of the first slice along axis 0; cells are
    # independent, so the visiting order does not matter.
    for cell in np.ndindex(obs_values[0].shape):
        # Skip cells that are masked in the first slice of the observations.
        if obs_cube_mask[(0,) + cell]:
            continue

        # Full series along axis 0 for this cell.
        index = (slice(0, None, 1),) + cell
        obs_data = obs_values[index]
        mod_data = mod_cube.data[index]
        mod_ecdf = ECDF(mod_data)

        for sce_cube in sce_cubes:
            sce_data = sce_cube[index].data
            # Rank of each scenario value within the model distribution.
            p = mod_ecdf(sce_data) * 100
            corr = np.percentile(obs_data, p) - np.percentile(mod_data, p)
            sce_cube.data[index] += corr
def q4():
    """Return the answer to question 4.

    Standardizes ``mean_profile`` of the rows of ``stars`` whose target is
    0 and evaluates its ECDF at the 0.80, 0.90 and 0.95 quantiles of a
    standard normal distribution.

    Returns:
        tuple: the three ECDF values, rounded to 3 decimals.
    """
    false_pulsar = stars[stars["target"] == 0]["mean_profile"]
    false_pulsar_mean_profile_standardized = (
        false_pulsar - false_pulsar.mean()) / false_pulsar.std()

    vals = sct.norm.ppf([0.8, 0.9, 0.95])  # the theoretical quantiles
    ecdf = ECDF(false_pulsar_mean_profile_standardized)
    resposta = ecdf(vals)

    # A conditional expression instead of `cond and round(x, 3) or x`: the
    # and/or form fell through to the unrounded value whenever the rounded
    # result was 0.0 (falsy).
    return tuple(
        round(x, 3) if isinstance(x, float) else x for x in resposta)
def permutationFWE(diff_arr, nullmean=0, permutations=1000, nproc=1):
    """
    Performs family-wise error correction using permutation testing
    (Nichols & Holmes 2002)

    Parameters:
        diff_arr = MxN matrix of set of M independent tests for condition 1
                   minus condition 2 across N subjects
        nullmean = Mean of the null hypothesis passed to the one-sample
                   t-test
        permutations = Number of permutations to perform
        nproc = number of processes to run in parallel

    Returns:
        t: Array of T-values of correct contrast map (Mx1 vector, for M
           tests)
        p: Array of FWE-corrected p-values (Mx1 vector, for M tests);
           Note, p-values correspond to values on the CDF. One-sided or
           two-sided p-values can be computed accordingly.

    N.B.: Only works for paired one-sample t-tests
    """
    # Focus on the difference matrix -- more computationally feasible (and
    # less data to feed into parallel processing).
    # One (data, null mean, random seed) tuple per permutation.
    inputs = [
        (diff_arr, nullmean, np.random.randint(0, 100000, 1)[0])
        for _ in range(permutations)
    ]

    pool = mp.Pool(processes=nproc)
    try:
        result = pool.map_async(_permutation, inputs).get()
    finally:
        # Shut the workers down even when a permutation raises; otherwise
        # the pool's processes are left behind.
        pool.close()
        pool.join()

    # Array of T-value distributions (our null distribution of "max-T"
    # values).
    maxT_dist = np.asarray(result)

    # Obtain real t-values
    t = stats.ttest_1samp(diff_arr, nullmean, axis=1)[0]

    # p-values read off the empirical CDF of the max-T distribution
    # (FWE-corrected p-values).
    ecdf = ECDF(maxT_dist)
    p_fwe = ecdf(t)

    return t, p_fwe
def plot_ccdf(sample):
    """Plot ``1 - ECDF`` of ``sample`` on log-log axes.

    The curve is evaluated on the default ``np.linspace`` grid between the
    smallest and largest sample value and drawn with blue 'x' markers; both
    axis ranges are set to the extent of the plotted points.
    """
    lowest, highest = min(sample), max(sample)
    x = np.linspace(lowest, highest)
    y = [1 - level for level in ECDF(sample)(x)]

    plt.plot(x, y, 'bx')
    axes = plt.gca()
    axes.set_xscale('log')
    axes.set_yscale('log')
    axes.set_aspect('equal')
    axes.set_xlim([min(x), max(x)])
    axes.set_ylim([min(y), max(y)])
    plt.show()
def counterfactual_ranks(points_to_predict, points_for_distribution,
                         method="smoothed"):
    r"""
    counterfactual ranks:
    compute \widehat U the value of the CDF at each element of
    points_to_predict, using the empirical CDF defined by
    'points_for_distribution'.

    :param points_to_predict: points for which to get the rank in the
        distribution
    :param points_for_distribution: points for which to compute the CDF
    :param method: "smoothed" (uses ``smoothed_ecdf``) or "standard" (uses
        the step-function ``ECDF``)
    :return: the CDF values at points_to_predict
    :raises ValueError: if method is neither "smoothed" nor "standard"
    """
    if method == "smoothed":
        return smoothed_ecdf(new_points=points_to_predict,
                             x=points_for_distribution)
    if method == "standard":
        return ECDF(points_for_distribution)(points_to_predict)
    # Previously an unknown method fell through to `return y` with `y`
    # never assigned, which surfaced as an UnboundLocalError.
    raise ValueError(
        "method must be 'smoothed' or 'standard', got {!r}".format(method))
def vol_control_window_selection(self):
    """Pick a rolling window per row from the volatility ECDF and rebuild
    the z-score with it.

    The volatility series is the exponentially weighted standard deviation
    of the first difference of ``self.y``. Its ECDF value ``u`` is mapped
    to a window ``floor((1 - u) * 12 + 2)``, so larger ECDF values give
    shorter windows. The per-row window is stored in
    ``self.df["aug_window"]`` and used to select the matching rolling mean
    and standard deviation of ``self.df.kalman_hedged_spread``, from which
    ``self.df["z_score"]``, ``self.mean_spread`` and ``self.std_spread``
    are set. Finally the ECDF ("window sampling profile") is plotted.
    """
    vol_rolling_window = self.z_score_rolling_window
    vol_data = self.y.diff().ewm(span=vol_rolling_window).std()
    vol_ecdf = ECDF(vol_data.values)

    def window_sample(x, ecdf):
        return np.floor((1 - ecdf(x)) * 12 + 2)

    # The fitted ECDF has to be passed explicitly; calling
    # window_sample(vol_data) alone raised a TypeError.
    self.df["aug_window"] = window_sample(vol_data, vol_ecdf)

    # Create a z-score dataframe containing all columns of possible windows
    all_possible_windows = {int(w) for w in self.df.aug_window.values}
    data = self.df.kalman_hedged_spread.copy()
    dynamic_rolling_mean = pd.DataFrame(index=data.index)
    dynamic_rolling_std = dynamic_rolling_mean.copy()
    for window in all_possible_windows:
        rolling = data.rolling(window=window)
        dynamic_rolling_mean[f"window_{window}"] = rolling.mean()
        dynamic_rolling_std[f"window_{window}"] = rolling.std()

    # For every row keep the statistic computed with that row's window.
    for idx, row in dynamic_rolling_mean.iterrows():
        roll_window = "window_" + str(int(self.df.loc[idx, "aug_window"]))
        self.df.loc[idx, "dy_roll_mean"] = row[roll_window]
    for idx, row in dynamic_rolling_std.iterrows():
        roll_window = "window_" + str(int(self.df.loc[idx, "aug_window"]))
        self.df.loc[idx, "dy_roll_std"] = row[roll_window]

    # NOTE(review): the rolling statistics come from kalman_hedged_spread
    # while the z-score numerator uses hedged_spread -- confirm this is
    # intended.
    self.df["z_score"] = (
        self.df.hedged_spread.values -
        self.df.dy_roll_mean.values) / self.df.dy_roll_std.values
    self.mean_spread = self.df.dy_roll_mean
    self.std_spread = self.df.dy_roll_std

    # Show window sampling profile
    plot_space = np.arange(0, vol_data.max(), 0.01)
    plt.plot(plot_space, vol_ecdf(plot_space))
    plt.title("Window sampling profile")
    plt.show()
def Empirical_ICDF(x, p):
    '''
    Returns the inverse empirical cumulative probability function of x
    evaluated at the probabilities p.

    The ECDF of x is evaluated at x itself and the resulting pairs are
    interpolated in the reverse direction. Probabilities outside the
    interpolation range map to nanmin(x) (below) and nanmax(x) (above).
    '''
    # TODO: check that fill_value works correctly

    # fit the ECDF and evaluate it at the sample points
    cdf_at_x = ECDF(x)(x)

    # interpolate probability -> value to recover the support values
    lower, upper = np.nanmin(x), np.nanmax(x)
    inverse_cdf = interp1d(
        cdf_at_x,
        x,
        bounds_error=False,
        fill_value=(lower, upper),
    )
    return inverse_cdf(p)
def plot_examine_diff_diff(start_time, fname="", loops=2, gap=0, thresh=10,
                           **kwas):
    '''
    Plot one CDF per group of "diff mask" match values and dump each group
    to a CSV file.

    lines:
        1) domain independent cdf of ALL matches
        2-n) cdf of matches for domain with answer space > thresh
        n-m) cdf of matches for ALL domains with answer space < thresh

    The figure is saved as <plotsdir>diff_mask<fname>.png and .pdf; every
    group is written to <plotsdir><label>_diff_jaccard.csv.
    '''
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svld, allsvl, allfmt, anssets = mv.arrange_self_data(
        start_time, gap, loops, **kwas)
    sm = mv.examine_diff_diff(svld)

    # vals[0] pools every domain, vals[1] pools the domains whose answer
    # set is smaller than thresh; each remaining domain gets its own entry.
    vals = [[], []]
    labels = ['all', 'small']
    for dom in sm:
        vals[0] = vals[0] + sm[dom]
        if len(anssets[dom]) < thresh:
            vals[1] += sm[dom]
        else:
            vals.append(sm[dom])
            labels.append(dom)

    fig, ax = plt.subplots(1, 1)
    # Single-argument print(...) calls behave the same under Python 2 and
    # Python 3, unlike the print statement.
    for label, values in zip(labels, vals):
        print("*****************"+label+"*********************")
        print(values)
        ecdf = ECDF(values)
        ax.plot(list(ecdf.x), list(ecdf.y), label=label)

    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("diff mask match by domain")
    plt.ylabel("CDF of clients")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"diff_mask"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,),
                bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,),
                bbox_inches='tight')
    plt.close(fig)

    print("saving data...")
    for label, values in zip(labels, vals):
        df.overwrite(plotsdir+label+'_diff_jaccard.csv',
                     df.list2col(values))
def graficar_sueldo_neto(esi):
    """Render the cumulative distribution of 'sueldo_neto' in Streamlit.

    Args:
        esi: DataFrame with a 'sueldo_neto' column; missing values are
            dropped before the ECDF is fitted.

    The ECDF (in percent) is evaluated on 10000 evenly spaced points
    between 0 and the column maximum and shown with ``st.plotly_chart``.
    """
    sueldo_col_name = 'sueldo_neto'
    cdf_function = ECDF(esi[sueldo_col_name].dropna().values)

    layout = dict(
        title="<span style='font-size:26px'>Study of age groups</span><br><span style='color:#999; font-size: 16px; font-weight:200'>students and professionals</span>",
        plot_bgcolor='#f5f5f5',
        margin=dict(t=50, l=0, r=0),
        legend=dict(yanchor='top', xanchor='right', x=0.992, y=0.98,
                    font=dict(size=12), traceorder='normal'),
        xaxis=dict(domain=[0, 1]),
        barmode="overlay",
        bargap=0.1,
        width=765
    )

    sueldo_range = np.linspace(0, esi[sueldo_col_name].max(), 10000)
    fig = px.line(x=sueldo_range, y=100 * cdf_function(sueldo_range))

    # px.line() has no `layout` keyword (passing one raises TypeError);
    # layout settings are applied to the finished figure instead.
    fig.update_layout(**layout)
    # NOTE(review): the explicit per-column title is applied last, so it
    # replaces the "Study of age groups" title in the layout dict --
    # confirm which of the two titles is wanted.
    fig.update_layout(
        title=f'curva de densidad acumulada de {sueldo_col_name}',
        xaxis=dict(title='Sueldo neto'),
        yaxis=dict(title='Percentil'))
    st.plotly_chart(fig)
def calculate_epsilon_per_pair_parallel(values, delta, precision):
    """Return the smallest epsilon over all values of one pair.

    The absolute values are sorted and an ECDF is fitted to them. For each
    value ``t_k`` the probability mass ``p_k`` within ``+/- max(values) *
    precision`` of it is read from the ECDF via ``calculate_cdf``; when
    ``round(1 - p_k, 2) <= delta`` the value contributes ``inf``, otherwise
    an epsilon derived from ``p_k`` and ``delta``.

    Args:
        values: numeric values; only their magnitudes are used.
        delta: threshold compared against ``1 - p_k``.
        precision: fraction of the largest magnitude used as half-width.

    Returns:
        The minimum epsilon, or ``inf`` when all values are equal (the
        caller is expected to handle that case on its own).
    """
    # values = [0.0, 0.2, .4, .6, .7, 10, 20, 100, 400, 500, 1000, 2000]
    values = sorted(abs(v) for v in values)
    R_ij = max(values)
    r_ij = R_ij * precision
    cdf = ECDF(values)

    # Each element is compared with its predecessor (the first one with
    # itself); no difference anywhere means all values are equal.
    predecessors = values[:1] + values[:-1]
    has_variation = any(
        cur != prev for prev, cur in zip(predecessors, values))

    epsilons = []
    if has_variation:
        for t_k in values:
            p_k = calculate_cdf(cdf, t_k + r_ij) - calculate_cdf(
                cdf, t_k - r_ij)
            # covering the case with risk less than or equal 1-p_k; the
            # round is for very small differences, like
            # 0.050000000000000044
            if round(1 - p_k, 2) <= delta:
                epsilons.append(inf)
                continue
            eps = -log(p_k / (1.0 - p_k) *
                       (1.0 / (delta + p_k) - 1.0)) / log(
                           exp(1.0)) * (1.0 / R_ij)
            epsilons.append(eps)

    # With no epsilons (all values equal) the result stays at inf.
    return min(epsilons) if epsilons else inf
def plotGwCorrmat(label, gldas, gwcorrmat, masking=True):
    '''
    plot correlation matrix between gw and CNN-learned model mismatch

    label: suffix of the output file name, gwcorr<label>.png
    gldas: object providing mask, gwvalidcells and
        getBasinBoundForPlotting()
    gwcorrmat: 2-D array of correlations to draw
    masking: when True, cells where gldas.mask != 1 are blanked, the colour
        range is fixed to [-0.8, 0.8] and an inset CDF of the valid-cell
        correlations is added
    '''
    print('max gw-grace correlation', np.nanmax(gwcorrmat))
    print('min gw-grace correlation', np.nanmin(gwcorrmat))

    cmap = ListedColormap(
        (sns.diverging_palette(240, 10, s=80, l=55, n=9)).as_hex())
    fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

    # NaN everywhere except where the mask equals 1. np.nan is used because
    # the np.NaN alias no longer exists in NumPy 2.0.
    pp = np.full(gldas.mask.shape, np.nan)
    pp[gldas.mask == 1] = 1.0

    if masking:
        temp = np.multiply(gwcorrmat, pp)
        im = ax.imshow(temp, cmap, origin='lower', vmax=0.8, vmin=-0.8)

        # plot CDF
        corrvec = gwcorrmat[gldas.gwvalidcells]
        # remove nan cells
        corrvec = corrvec[np.where(np.isnan(corrvec) == False)[0]]
        cdf = ECDF(corrvec)

        # [left, bottom, width, height]
        figpos = [0.52, 0.7, 0.18, 0.16]
        sns.set_style("ticks", {"xtick.major.size": 6,
                                "ytick.major.size": 6})
        ax2 = fig.add_axes(figpos)
        ax2.set_ylim(0, 1)
        ax2.grid(True)
        ax2.plot(cdf.x, cdf.y, linewidth=1.5)
        ax2.set_xlabel('Correlation')
        ax2.set_ylabel('CDF')
    else:
        im = ax.imshow(gwcorrmat, cmap, origin='lower')

    cx, cy = gldas.getBasinBoundForPlotting()
    ax.plot(cx, cy, '-', color='#7B7D7D')
    plt.colorbar(im, orientation="horizontal", fraction=0.046, pad=0.1,
                 ax=ax)
    # `frameon` is no longer a savefig keyword in current Matplotlib;
    # transparent=True already makes the figure background transparent.
    plt.savefig('gwcorr{0}.png'.format(label), dpi=fig.dpi,
                transparent=True)
def filter_variants(boost_obj, args):
    """Select the variants that the fitted model considers real.

    Args:
        boost_obj: object exposing ``y_pred``, ``X_data`` and (for the
            regressor) ``y_data``.
        args: namespace with ``out``, ``predictor``, ``model`` and the
            ``snps`` / ``indels`` / ``all`` flags.

    Returns:
        For the classifier, the 'chr:pos' entries whose predicted score is
        1. For the regressor, the index of the rows whose prediction lies
        above the 0.05 level of the ECDF fitted to the predictions of the
        rows with a non-zero ``y_data``; the ECDF plot is also saved into
        ``args.out``.

    Raises:
        ValueError: if ``args.predictor`` is neither 'classifier' nor
            'regressor' (previously this ended in an UnboundLocalError).
    """
    logger.info("Filtering Variants")
    path = args.out
    predictor = args.predictor.lower()

    if predictor == 'classifier':
        df1 = pd.DataFrame()
        df1['xgb_score'] = boost_obj.y_pred
        df1['chr:pos'] = boost_obj.X_data.index
        real_snps = df1[df1['xgb_score'] == 1]['chr:pos']
    elif predictor == 'regressor':
        fitted_values = boost_obj.y_pred
        # Positions of the rows with a non-zero y value.
        RS1 = list(boost_obj.y_data.to_numpy().nonzero()[0])
        ecdf_func = ECDF([fitted_values[idx] for idx in RS1])
        fitted_value_scores = ecdf_func(fitted_values)
        min_qscore = 0.05
        real_snps_idx = [i > min_qscore for i in fitted_value_scores]
        real_snps = boost_obj.X_data.loc[real_snps_idx].index

        # Plot ECDF
        logger.info("Plotting ECDF")
        plt.figure()
        plt.plot(ecdf_func.x, ecdf_func.y, '.')
        plt.xlabel('Boosting_Score')
        plt.ylabel('ECDF_Score')
        plt.title('ECDF')
        if args.snps:
            suffix = '_snps'
        elif args.indels:
            suffix = '_indels'
        else:
            # Covers args.all and, as a fallback, the case where no flag is
            # set (which used to leave the file name undefined).
            suffix = ''
        name = ('ecdf_' + args.model + '_' + args.predictor + suffix +
                '.png')
        plt.savefig(os.path.join(path, name))
    else:
        raise ValueError(
            "args.predictor must be 'classifier' or 'regressor', "
            "got {!r}".format(args.predictor))

    return real_snps