def get_minmax(sample, min_samples=8):
    """
    We consider that a ratio of connections is typical if it falls within the
    99.99 % percentile of the Normal distribution N(m, v) modelling ratios.
    This ensures that the expected rate of false alarms is about 1/10000, and
    therefore only a handful per week (given the large number of
    jurisdictions). Similarly, we infer the range of the rate of usage from
    each jurisdiction (given C_ij) to be the 99.99 % percentile range of a
    Poisson distribution with parameter C_ij. This full range must be within
    the typical range of ratios to avoid raising an alarm.

    Args:
        sample (pandas.core.series.Series): A series containing the relative
            change values for a set of countries.
    """
    log.debug("Getting min and max for a sample on {0}'s data: {1}".format(sample.name, sample))
    initial_sample_len = len(sample)
    if initial_sample_len > min_samples:
        sample = drop_outliers(sample)
        num_outliers = initial_sample_len - len(sample)
        log.debug("Sample had {0} outliers removed. Current sample: {1}".format(num_outliers, sample))
        if len(sample) > min_samples:
            mu, sigma = norm.fit(sample)
            sample_max = norm.ppf(0.9999, mu, sigma)
            sample_min = norm.ppf(1 - 0.9999, mu, sigma)
            log.debug("Sample min == {0}, Sample max == {1}".format(sample_min, sample_max))
            return pd.Series({"max": sample_max, "min": sample_min})
        else:
            log.debug("After removing outliers the sample was a length of {0}. This is shorter than the acceptable minimum length of {1}.".format(len(sample), min_samples))
            return pd.Series({"max": None, "min": None})
    else:
        log.debug("Sample with length of {0} is shorter than the acceptable minimum length of {1}.".format(initial_sample_len, min_samples))
        return pd.Series({"max": None, "min": None})
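# Illustrative sketch (not from the original source): the band computed above is just a
# two-sided 99.99 % interval of a fitted normal. Only numpy/scipy are used here; the
# helpers drop_outliers, log and pd in the snippet belong to the original project and are
# not redefined.
import numpy as np
from scipy.stats.distributions import norm

ratios = np.random.default_rng(0).normal(loc=1.0, scale=0.05, size=50)  # hypothetical relative changes
mu_hat, sigma_hat = norm.fit(ratios)
lo, hi = norm.ppf(1 - 0.9999, mu_hat, sigma_hat), norm.ppf(0.9999, mu_hat, sigma_hat)
print("typical range: [{0:.3f}, {1:.3f}]".format(lo, hi))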
def Z_test(x1, x2, Alpha=0.95):

    # Compute standard deviations and numbers of observations
    S_x1 = x1.std(ddof=1)
    S_x2 = x2.std(ddof=1)
    N_x1 = len(x1)
    N_x2 = len(x2)

    # Test statistic and p value
    Z = (x1.mean() - x2.mean()) / np.sqrt(S_x1**2 / N_x1 + S_x2**2 / N_x2)
    p = 2 * (1 - norm.cdf(abs(Z)))

    # Rejection range
    MinValue = norm.ppf((1 - Alpha) / 2)
    MaxValue = norm.ppf(1 - (1 - Alpha) / 2)
    RejectionRange = np.array([[-np.inf, round(MinValue, 3)], [round(MaxValue, 3), np.inf]])

    Results = {'Test statistic': round(Z, 3),
               'p value': round(p, 9),
               'Significance level (%)': Alpha * 100,
               'Rejection range': RejectionRange}

    # DataFrame.append was removed in pandas 2.0; build the one-row table directly instead
    ResultsTable = pd.Series(Results).to_frame().T

    return ResultsTable
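# Usage sketch (added for illustration, assuming the Z_test above and its numpy/pandas/scipy
# imports are in scope): compare two hypothetical samples; with Alpha=0.95 the null is
# rejected when the test statistic falls outside roughly [-1.96, 1.96].
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
a = pd.Series(rng.normal(0.0, 1.0, 200))
b = pd.Series(rng.normal(0.3, 1.0, 200))
print(Z_test(a, b))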
def make_tendencies_minmax(l, INTERVAL=1):
    lminus1 = dict((ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l)
    c = lminus1[list(lminus1.keys())[0]]
    dists = []
    minx = []
    maxx = []
    for i in range(len(c)):
        vals = [lminus1[ccode][i] for ccode in lminus1 if lminus1[ccode][i] is not None]
        if len(vals) < 8:
            dists += [None]
            minx += [None]
            maxx += [None]
        else:
            vals.sort()
            # integer indices (Python 3 requires // for floor division)
            median = vals[len(vals) // 2]
            q1 = vals[len(vals) // 4]
            q2 = vals[(3 * len(vals)) // 4]
            qd = q2 - q1
            vals = [v for v in vals if median - qd * 4 < v < median + qd * 4]
            if len(vals) < 8:
                dists += [None]
                minx += [None]
                maxx += [None]
                continue
            mu, sigma = norm.fit(vals)
            dists += [(mu, sigma)]
            maxx += [norm.ppf(0.9999, mu, sigma)]
            minx += [norm.ppf(1 - 0.9999, mu, sigma)]
            # print(minx[-1], maxx[-1])
    return minx, maxx
def QQPlot(DataValues, Alpha_CI=0.95, DataLabel='Data', FigFile='QQPlot.png'):

    ### Based on: https://www.tjmahr.com/quantile-quantile-plots-from-scratch/
    ### Itself based on Fox book: Fox, J. (2015)
    ### Applied Regression Analysis and Generalized Linear Models.
    ### Sage Publications, Thousand Oaks, California.

    # Data analysis
    N = len(DataValues)
    X_Bar = np.mean(DataValues)
    S_X = np.std(DataValues, ddof=1)

    # Sort data to get the rank
    Data_Sorted = np.zeros(N)
    Data_Sorted += DataValues
    Data_Sorted.sort()

    # Compute quantiles
    EmpiricalQuantiles = np.arange(0.5, N + 0.5) / N
    TheoreticalQuantiles = norm.ppf(EmpiricalQuantiles, X_Bar, S_X)
    ZQuantiles = norm.ppf(EmpiricalQuantiles, 0, 1)

    # Compute data variance
    DataIQR = np.quantile(DataValues, 0.75) - np.quantile(DataValues, 0.25)
    NormalIQR = np.sum(np.abs(norm.ppf(np.array([0.25, 0.75]), 0, 1)))
    Variance = DataIQR / NormalIQR
    Z_Space = np.linspace(min(ZQuantiles), max(ZQuantiles), 100)
    Variance_Line = Z_Space * Variance + np.median(DataValues)

    # Compute alpha confidence interval (CI)
    Z_SE = np.sqrt(norm.cdf(Z_Space) * (1 - norm.cdf(Z_Space)) / N) / norm.pdf(Z_Space)
    Data_SE = Z_SE * Variance
    Z_CI_Quantile = norm.ppf(np.array([(1 - Alpha_CI) / 2]), 0, 1)

    # Create points in the data space
    Data_Space = np.linspace(min(TheoreticalQuantiles), max(TheoreticalQuantiles), 100)

    # QQPlot
    BorderSpace = max(0.05 * abs(Data_Sorted.min()), 0.05 * abs(Data_Sorted.max()))
    Y_Min = Data_Sorted.min() - BorderSpace
    Y_Max = Data_Sorted.max() + BorderSpace
    Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
    Axes.plot(TheoreticalQuantiles, Data_Sorted, linestyle='none', marker='o',
              mew=0.5, fillstyle='none', color=(0, 0, 0), label=DataLabel)
    Axes.plot(Data_Space, Variance_Line, linestyle='--', color=(1, 0, 0),
              label='Variance :' + str(format(np.round(Variance, 2), '.2f')))
    Axes.plot(Data_Space, Variance_Line + Z_CI_Quantile * Data_SE,
              linestyle='--', color=(0, 0, 1), label=str(int(100 * Alpha_CI)) + '% CI')
    Axes.plot(Data_Space, Variance_Line - Z_CI_Quantile * Data_SE,
              linestyle='--', color=(0, 0, 1))
    plt.xlabel('Theoretical quantiles (-)')
    plt.ylabel('Empirical quantiles (-)')
    plt.ylim([Y_Min, Y_Max])
    plt.legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15), prop={'size': 10})
    plt.savefig(FigFile)
    plt.show()
    plt.close(Figure)
    return Variance
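# Usage sketch (illustrative, assuming the QQPlot function above plus numpy, scipy and
# matplotlib are available): draw a QQ plot for simulated normal data; the returned
# "Variance" value is the IQR-based scale estimate used for the reference line.
import numpy as np

values = np.random.default_rng(2).normal(loc=5.0, scale=2.0, size=100)
scale_est = QQPlot(values, Alpha_CI=0.95, DataLabel='Simulated', FigFile='QQPlot_demo.png')
print('IQR-based scale estimate:', scale_est)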
def summary(self, yname=None, xname=None, title=None, alpha=0.05):

    df = pd.DataFrame()

    df["Type"] = (["Mean"] * self.k_exog + ["Scale"] * self.k_scale +
                  ["Smooth"] * self.k_smooth + ["SD"] * self.k_noise)
    df["coef"] = self.params

    try:
        df["std err"] = np.sqrt(np.diag(self.cov_params()))
    except Exception:
        df["std err"] = np.nan

    from scipy.stats.distributions import norm
    df["tvalues"] = df.coef / df["std err"]
    df["P>|t|"] = 2 * norm.sf(np.abs(df.tvalues))

    f = norm.ppf(1 - alpha / 2)
    df["[%.3f" % (alpha / 2)] = df.coef - f * df["std err"]
    df["%.3f]" % (1 - alpha / 2)] = df.coef + f * df["std err"]

    df.index = self.model.data.param_names

    summ = summary2.Summary()
    if title is None:
        title = "Gaussian process regression results"
    summ.add_title(title)
    summ.add_df(df)

    return summ
def startSimulation(self) -> bool:
    if self.is_running():
        logging.info(self.__class__.__name__, ":startSimulation already in progress.")
        return False
    if not self._validate():
        return False
    if len(self._simulation_profiles) == 0:
        return False

    days_range: range = range(0, self._num_trading_days, 1) \
        if self._trading_days_order == "A" \
        else range(self._num_trading_days - 1, -1, -1)

    for sim_num in range(self.simulations_number):
        logging.debug(self.__class__.__name__, 'Starting simulation {}'.format(sim_num))
        for day in days_range:
            daily_return = norm.ppf(
                np.random.rand(self._num_years_per_sim),
                self._ret_dist_mean,
                self._ret_dist_std)
            done = day == days_range[-1]
            for profile_name, profile in self._simulation_profiles.items():
                logging.debug(
                    self.__class__.__name__,
                    ": Simulating profile {}".format(profile_name))
                profile.performTransition(daily_return, (sim_num, day, done))

    return True
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval.  Of the same length
        as `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval.  Of the same length
        as `prob`.

    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha/2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)
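# Quick check (added for illustration): scipy ships this estimator as
# scipy.stats.mstats.mquantiles_cimj, so the snippet above can be compared against the
# library version directly on simulated data.
import numpy as np
from scipy.stats import mstats

data = np.random.default_rng(3).normal(size=500)
lower, upper = mstats.mquantiles_cimj(data, prob=[0.25, 0.5, 0.75], alpha=0.05)
print(np.round(lower, 3), np.round(upper, 3))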
def test_null_constrained():

    # Create a mixed population of Z-scores: 1000 standard normal and
    # 20 uniformly distributed between 3 and 4.
    grid = np.linspace(0.001, 0.999, 1000)
    z0 = norm.ppf(grid)
    z1 = np.linspace(3, 4, 20)
    zs = np.concatenate((z0, z1))

    for estimate_mean in False, True:
        for estimate_scale in False, True:
            for estimate_prob in False, True:

                emp_null = NullDistribution(zs, estimate_mean=estimate_mean,
                                            estimate_scale=estimate_scale,
                                            estimate_null_proportion=estimate_prob)

                if not estimate_mean:
                    assert_allclose(emp_null.mean, 0, atol=1e-5, rtol=1e-5)
                if not estimate_scale:
                    assert_allclose(emp_null.sd, 1, atol=1e-5, rtol=1e-2)
                if not estimate_prob:
                    assert_allclose(emp_null.null_proportion, 1, atol=1e-5, rtol=1e-2)

                # consistency check
                assert_allclose(emp_null.pdf(np.r_[-1, 0, 1]),
                                norm.pdf(np.r_[-1, 0, 1], loc=emp_null.mean,
                                         scale=emp_null.sd),
                                rtol=1e-13)
def sampleSize(stdev, tolerance, percentConfidence, printLatex=False):
    from scipy.stats.distributions import norm
    k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1) * 100) / 100.  # 1.-(100-percentConfidence)/200.
    if printLatex:
        print('${0}^2\\frac{{{1}^2}}{{{2}^2}}$'.format(k, stdev, tolerance))
    return (k * stdev / tolerance)**2
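# Worked example (illustrative, assuming the sampleSize function above): for a 95 %
# confidence level, k rounds to 1.96, so stdev=2 and tolerance=0.5 give
# (1.96 * 2 / 0.5)**2 ≈ 61.5, i.e. about 62 samples.
n_required = sampleSize(stdev=2.0, tolerance=0.5, percentConfidence=95)
print(n_required)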
def test_null_constrained(estimate_mean, estimate_scale, estimate_prob):

    # Create a mixed population of Z-scores: 1000 standard normal and
    # 20 uniformly distributed between 3 and 4.
    grid = np.linspace(0.001, 0.999, 1000)
    z0 = norm.ppf(grid)
    z1 = np.linspace(3, 4, 20)
    zs = np.concatenate((z0, z1))

    emp_null = NullDistribution(zs, estimate_mean=estimate_mean,
                                estimate_scale=estimate_scale,
                                estimate_null_proportion=estimate_prob)

    if not estimate_mean:
        assert_allclose(emp_null.mean, 0, atol=1e-5, rtol=1e-5)
    if not estimate_scale:
        assert_allclose(emp_null.sd, 1, atol=1e-5, rtol=1e-2)
    if not estimate_prob:
        assert_allclose(emp_null.null_proportion, 1, atol=1e-5, rtol=1e-2)

    # consistency check
    assert_allclose(emp_null.pdf(np.r_[-1, 0, 1]),
                    norm.pdf(np.r_[-1, 0, 1], loc=emp_null.mean, scale=emp_null.sd),
                    rtol=1e-13)
def regr_data_prep(self, kk, N_i=1):
    ''' Regression data preparation via nested simulations '''
    import customML as cm

    # --- Computation budget allocation ---
    N_o = int(kk/N_i)

    # --- portfolio price @ t = \tau via Nested simulations ---
    t0 = time.time()
    ran1 = npr.standard_normal((N_o,1))
    S1 = np.zeros((N_o,1))
    S1[:] = self.S0
    S1[:] = S1[:] * np.exp((self.mu - 0.5*self.sigma*self.sigma)*self.tau + \
                           self.sigma * np.sqrt(self.tau) * ran1[:])

    ran2 = npr.standard_normal((N_o,N_i))
    S2 = np.zeros((N_o,N_i))
    S2[:,:] = np.dot(S1[:],np.ones((1,N_i))) * np.exp((self.rfr - 0.5*self.sigma*self.sigma)*(self.T-self.tau) \
              + self.sigma * np.sqrt(self.T-self.tau) * ran2[:,:])

    prob0 = (1.-np.exp(-2.*(np.log(np.dot(S1[:],np.ones((1,N_i)))/(self.H[0]*np.ones((N_o,N_i))))\
             *np.log(S2[:,:]/(self.H[0]*np.ones((N_o,N_i))))/(self.sigma**2)/(self.T-self.tau))))\
             *(np.dot(S1[:],np.ones((1,N_i))) >= self.H[0]*np.ones((N_o,N_i))).astype(float)\
             *(S2[:,:] >= self.H[0]*np.ones((N_o,N_i))).astype(float)
    prob1 = (1.-np.exp(-2.*(np.log(np.dot(S1[:],np.ones((1,N_i)))/(self.H[1]*np.ones((N_o,N_i))))\
             *np.log(S2[:,:]/(self.H[1]*np.ones((N_o,N_i))))/(self.sigma**2)/(self.T-self.tau))))\
             *(np.dot(S1[:],np.ones((1,N_i))) >= self.H[1]*np.ones((N_o,N_i))).astype(float)\
             *(S2[:,:] >= self.H[1]*np.ones((N_o,N_i))).astype(float)
    prob2 = (1.-np.exp(-2.*(np.log(np.dot(S1[:],np.ones((1,N_i)))/(self.H[2]*np.ones((N_o,N_i))))\
             *np.log(S2[:,:]/(self.H[2]*np.ones((N_o,N_i))))/(self.sigma**2)/(self.T-self.tau))))\
             *(np.dot(S1[:],np.ones((1,N_i))) >= self.H[2]*np.ones((N_o,N_i))).astype(float)\
             *(S2[:,:] >= self.H[2]*np.ones((N_o,N_i))).astype(float)

    Vtau0 = np.dot((np.maximum(self.K[0]-S2[:,:],0)*prob0), np.ones((N_i,1))) / \
            float(N_i) * np.exp(-self.rfr*(self.T-self.tau))
    Vtau1 = np.dot((np.maximum(self.K[1]-S2[:,:],0)*prob1), np.ones((N_i,1))) / \
            float(N_i) * np.exp(-self.rfr*(self.T-self.tau))
    Vtau2 = np.dot((np.maximum(self.K[2]-S2[:,:],0)*prob2), np.ones((N_i,1))) / \
            float(N_i) * np.exp(-self.rfr*(self.T-self.tau))

    ValueTau = Vtau0*self.pos[0] + Vtau1*self.pos[1] + Vtau2*self.pos[2]
    t_ns = time.time() - t0

    # prediction samples
    #ran3 = norm(loc=0, scale=1).ppf(lhs(D, samples=I_pred))
    stratified_gaussian = np.array([(i-0.5)/self.I_pred for i in range(1, self.I_pred+1)])
    ran3 = norm.ppf(stratified_gaussian[:, np.newaxis])
    S_pred = np.zeros((self.I_pred,1))
    S_pred[:] = self.S0
    S_pred[:] = S_pred[:] * np.exp((self.mu - 0.5*self.sigma*self.sigma)*self.tau + \
                                   self.sigma*np.sqrt(self.tau) * ran3[:])

    self.X = S1
    self.X_pred = S_pred
    self.y = ValueTau
def test_sqrt_lasso():

    np.random.seed(234923)

    # Based on the example in the Belloni paper
    n = 100
    p = 500
    ii = np.arange(p)
    cx = 0.5**np.abs(np.subtract.outer(ii, ii))
    cxr = np.linalg.cholesky(cx)
    x = np.dot(np.random.normal(size=(n, p)), cxr.T)
    b = np.zeros(p)
    b[0:5] = [1, 1, 1, 1, 1]

    from scipy.stats.distributions import norm
    alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p))

    # Use very low noise level for a unit test
    y = np.dot(x, b) + 0.25 * np.random.normal(size=n)

    # At low noise levels, the sqrt lasso should be around a
    # factor of 3 from the oracle without refit, and should
    # almost equal the oracle with refit.
    expected_oracle = {False: 3, True: 1}

    # Used for regression testing
    expected_params = {
        False: np.r_[0.87397122, 0.96051874, 0.9905915, 0.93868953, 0.90771773],
        True: np.r_[0.95114241, 1.0302987, 1.01723074, 0.97587343, 0.99846403]
    }

    for refit in False, True:

        rslt = OLS(y, x).fit_regularized(method="sqrt_lasso", alpha=alpha, refit=refit)
        err = rslt.params - b
        numer = np.sqrt(np.dot(err, np.dot(cx, err)))

        oracle = OLS(y, x[:, 0:5]).fit()
        oracle_err = np.zeros(p)
        oracle_err[0:5] = oracle.params - b[0:5]
        denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err)))

        # Check performance relative to oracle, should be around
        assert_allclose(numer / denom, expected_oracle[refit], rtol=0.5, atol=0.1)

        # Regression test the parameters
        assert_allclose(rslt.params[0:5], expected_params[refit], rtol=1e-5, atol=1e-5)
def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits=None, printLatex=False):
    from scipy.stats.distributions import norm
    if nRoundingDigits is None:
        k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1), 2)  # 1.-(100-percentConfidence)/200.
    else:
        k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1), nRoundingDigits)
        stdev = round(stdev, nRoundingDigits)
        tolerance = round(tolerance, nRoundingDigits)
    if printLatex:
        print('$z_{{{}}}^2\\frac{{s^2}}{{e^2}}={}^2\\frac{{{}^2}}{{{}^2}}$'.
              format(0.5 + percentConfidence / 200., k, stdev, tolerance))
    return (k * stdev / tolerance)**2
def logrank_power(n, surv1, surv2, alpha=0.05):
    d = n * (2 - surv1 - surv2)
    if surv1 == 1 or surv2 == 1:
        return 0
    elif surv1 == 0 or surv2 == 0:
        return -1
    phi = log(surv1) / log(surv2) if surv1 < surv2 else log(surv2) / log(surv1)
    z_a = norm.ppf(1 - alpha)
    z_1_beta = sqrt(d * (1 - phi) * (1 - phi) / (1 + phi) / (1 + phi)) - z_a
    return norm.cdf(z_1_beta)
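# Usage sketch (illustrative, assuming the logrank_power function above and that log, sqrt
# and norm are imported from math/scipy as in the original module): approximate power of a
# log-rank test with n=100 and survival fractions of 0.5 versus 0.7.
print(logrank_power(n=100, surv1=0.5, surv2=0.7, alpha=0.05))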
def getdprime(A_correct, A_total, B_correct, B_total, corrected):
    if corrected == True:
        if A_correct == A_total:
            tA = 1 - 1 / (2 * A_total)
        elif A_correct == 0:
            tA = 1 / (2 * A_total)
        else:
            tA = A_correct / (A_total)
        if B_correct == B_total:
            tB = 1 - 1 / (2 * B_total)
        elif B_correct == 0:
            tB = 1 / (2 * B_total)
        else:
            tB = B_correct / (B_total)
    else:
        tA = A_correct / (A_total)
        tB = B_correct / (B_total)
    dp = norm.ppf(tA) - norm.ppf(1 - (tB))
    return dp
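# Worked example (added for illustration, assuming the getdprime function above):
# 45/50 correct on A-trials and 40/50 correct on B-trials gives
# d' = norm.ppf(0.9) - norm.ppf(0.2) ≈ 2.12; the `corrected` flag only matters for
# perfect (or zero) scores, where it avoids infinite z-values.
print(getdprime(45, 50, 40, 50, corrected=True))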
def statsmodels_to_results(model):
    """
    Convert statsmodels summary to a dataframe.

    Parameters
    ----------
    model : statsmodels model output
        The output of a statsmodels analysis. For example rlm or mixedlm.

    Returns
    -------
    df : Pandas dataframe.
    """
    from statsmodels.regression.mixed_linear_model import MixedLMResultsWrapper
    from scipy.stats.distributions import norm
    df = summary_to_dataframe(model.summary())
    # deal with numerical precision loss in at least some of the values
    for col, attr in _REPLACEMENTS:
        if col in df.columns:
            df[col] = getattr(model, attr, df[col])
    # This one messes up the standard error and quartiles, too
    if isinstance(model, MixedLMResultsWrapper):
        sl = slice(model.k_fe)
        mu = np.asarray(df.iloc[sl, df.columns == 'Coef.'])[:, 0]
        # Adapted from statsmodels, see
        # https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/mixed_linear_model.py#L2710-L2736  # noqa: E501
        stderr = np.sqrt(np.diag(model.cov_params()[sl]))
        df.iloc[sl, df.columns == 'Std.Err.'] = stderr
        # Confidence intervals
        qm = -norm.ppf(0.05 / 2)
        df.iloc[sl, df.columns == '[0.025'] = mu - qm * stderr
        df.iloc[sl, df.columns == '0.975]'] = mu + qm * stderr
        # All random effects variances and covariances
        sdf = np.zeros((model.k_re2 + model.k_vc, 2))
        jj = 0
        for i in range(model.k_re):
            for j in range(i + 1):
                sdf[jj, 0] = np.asarray(model.cov_re)[i, j]
                sdf[jj, 1] = np.sqrt(model.scale) * model.bse[model.k_fe + jj]
                jj += 1
        # Variance components
        for i in range(model.k_vc):
            sdf[jj, 0] = model.vcomp[i]
            sdf[jj, 1] = np.sqrt(model.scale) * model.bse[model.k_fe + jj]
            jj += 1
        df.iloc[model.k_fe:, df.columns == 'Coef.'] = sdf[:, 0]
        df.iloc[model.k_fe:, df.columns == 'Std.Err.'] = sdf[:, 1]
    df = expand_summary_dataframe(df)
    return df
def _calculate_ci_approx(p, sigma, n):
    """Return index j and k that correspond to confidence interval of level
    sigma for percentile p*100, along with the respective confidence level.

    Large n approximation.
    """
    nu = norm.ppf((1 + sigma) / 2) * np.sqrt(p * (1 - p))
    # print(nu)
    j = np.floor(n * p - nu * np.sqrt(n))
    k = np.ceil(n * p + nu * np.sqrt(n))
    return (j, k, sigma)
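# Worked example (illustrative, assuming the helper above and its numpy/scipy imports):
# for the median (p=0.5) of n=1000 observations at 95 % confidence,
# nu = norm.ppf(0.975) * 0.5 ≈ 0.98, so the interval runs roughly from order statistic
# 469 to 531.
print(_calculate_ci_approx(p=0.5, sigma=0.95, n=1000))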
def confidenceInterval(mean, stdev, nSamples, percentConfidence, trueStd=True, printLatex=False):
    '''if trueStd, use normal distribution, otherwise, Student

    Use otherwise t.interval or norm.interval
    ex: norm.interval(0.95, loc = 0., scale = 2.3/sqrt(11))
    t.interval(0.95, 10, loc=1.2, scale = 2.3/sqrt(nSamples))
    loc is mean, scale is sigma/sqrt(n) (for Student, 10 is df)'''
    from math import sqrt
    from scipy.stats.distributions import norm, t
    if trueStd:
        k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1) * 100) / 100.  # 1.-(100-percentConfidence)/200.
    else:  # use Student
        k = round(t.ppf(0.5 + percentConfidence / 200., nSamples - 1) * 100) / 100.
    e = k * stdev / sqrt(nSamples)
    if printLatex:
        print('${0} \pm {1}\\frac{{{2}}}{{\sqrt{{{3}}}}}$'.format(mean, k, stdev, nSamples))
    return mean - e, mean + e
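# Usage sketch (illustrative, assuming the confidenceInterval function above): a sample
# mean of 10.0 with stdev 2.0 over 30 observations at 95 % confidence gives
# 10 ± 1.96 * 2 / sqrt(30), i.e. roughly (9.28, 10.72).
print(confidenceInterval(10.0, 2.0, 30, 95, trueStd=True))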
def test_local_fdr():

    # Create a mixed population of Z-scores: 1000 standard normal and
    # 20 uniformly distributed between 3 and 4.
    grid = np.linspace(0.001, 0.999, 1000)
    z0 = norm.ppf(grid)
    z1 = np.linspace(3, 4, 20)
    zs = np.concatenate((z0, z1))

    # Exact local FDR for U(3, 4) component.
    f1 = np.exp(-z1**2 / 2) / np.sqrt(2 * np.pi)
    r = len(z1) / float(len(z0) + len(z1))
    f1 /= (1 - r) * f1 + r

    fdr = local_fdr(zs)
    fdr1 = fdr[len(z0):]

    assert_allclose(f1, fdr1, rtol=0.05, atol=0.1)
def avg_with_error(_d):
    try:
        d = list(map(float, _d))  # materialize for len() and repeated use under Python 3
    except:
        print(_d)
        raise
    if has_confidence:
        n = len(d)
        avg = mean(d)
        sd = std(d)
        alpha = 1.0 - confidence
        intv = norm.ppf(1.0 - alpha / 2.0) * (sd / sqrt(n))
        return (avg, intv)
    else:
        n = len(d)
        if n > 0:
            return (sum(d) / float(n), 0.0)
        else:
            return (0.0, 0.0)
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None): """Computes the alpha confidence interval for the selected quantiles of the data, with Maritz-Jarrett estimators. :Input: data : sequence Input data. prob : sequence *[0.25,0.5,0.75]* Sequence of quantiles whose standard error must be estimated. alpha : float *[0.05]* Confidence degree. axis : integer *[None]* Axis along which to compute the standard error. """ alpha = min(alpha, 1-alpha) z = norm.ppf(1-alpha/2.) xq = mquantiles(data, prob, alphap=0, betap=0, axis=axis) smj = mjci(data, prob, axis=axis) return (xq - z * smj, xq + z * smj)
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None): """Computes the alpha confidence interval for the selected quantiles of the data, with Maritz-Jarrett estimators. Parameters ---------- data: ndarray Data array. prob: sequence Sequence of quantiles to compute. alpha : float Confidence level of the intervals. axis : integer Axis along which to compute the quantiles. If None, use a flattened array. """ alpha = min(alpha, 1-alpha) z = norm.ppf(1-alpha/2.) xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis) smj = mjci(data, prob, axis=axis) return (xq - z * smj, xq + z * smj)
def mquantiles_cimj(data, prob=[0.25, 0.50, 0.75], alpha=0.05, axis=None): """Computes the alpha confidence interval for the selected quantiles of the data, with Maritz-Jarrett estimators. Parameters ---------- data: ndarray Data array. prob: sequence Sequence of quantiles to compute. alpha : float Confidence level of the intervals. axis : integer Axis along which to compute the quantiles. If None, use a flattened array. """ alpha = min(alpha, 1 - alpha) z = norm.ppf(1 - alpha / 2.) xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis) smj = mjci(data, prob, axis=axis) return (xq - z * smj, xq + z * smj)
def confidenceInterval(mean, stdev, nSamples, percentConfidence, trueStd=True, printLatex=False):
    '''if trueStd, use normal distribution, otherwise, Student

    Use otherwise t.interval or norm.interval
    ex: norm.interval(0.95, loc = 0., scale = 2.3/sqrt(11))
    t.interval(0.95, 10, loc=1.2, scale = 2.3/sqrt(nSamples))
    loc is mean, scale is sigma/sqrt(n) (for Student, 10 is df)'''
    from scipy.stats.distributions import norm, t
    if trueStd:
        k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1), 2)
    else:  # use Student
        k = round(t.ppf(0.5 + percentConfidence / 200., nSamples - 1), 2)
    e = k * stdev / sqrt(nSamples)
    if printLatex:
        print('${0} \pm {1}\\frac{{{2}}}{{\sqrt{{{3}}}}}$'.format(
            mean, k, stdev, nSamples))
    return mean - e, mean + e
def __init__(self, n_normal, normal_max_value, p_zeros, rhos, i_normal=None, i_ps=None):
    """
    Constructor.

    The Gaussian Conditional Independence Model for Credit Risk
    Reference: https://arxiv.org/abs/1412.1183

    Args:
        n_normal (int): number of qubits to represent the latent normal random variable Z
        normal_max_value (float): min/max value to truncate the latent normal random variable Z
        p_zeros (list or array): standard default probabilities for each asset
        rhos (list or array): sensitivities of default probability of assets with respect to latent variable Z
        i_normal (list or array): indices of qubits to represent normal variable
        i_ps (list or array): indices of qubits to represent asset defaults
    """
    self.n_normal = n_normal
    self.normal_max_value = normal_max_value
    self.p_zeros = p_zeros
    self.rhos = rhos
    self.K = len(p_zeros)
    num_qubits = [n_normal] + [1] * self.K

    # set and store indices
    if i_normal is not None:
        self.i_normal = i_normal
    else:
        self.i_normal = range(n_normal)

    if i_ps is not None:
        self.i_ps = i_ps
    else:
        self.i_ps = range(n_normal, n_normal + self.K)

    # get normal (inverse) CDF and pdf
    F = lambda x: norm.cdf(x)
    F_inv = lambda q: norm.ppf(q)
    f = lambda x: norm.pdf(x)

    # set low/high values
    low = [-normal_max_value] + [0] * self.K
    high = [normal_max_value] + [1] * self.K

    # call super constructor
    super().__init__(num_qubits, low=low, high=high)

    # create normal distribution
    self._normal = NormalDistribution(n_normal, 0, 1, -normal_max_value, normal_max_value)

    # create linear rotations for conditional defaults
    self._slopes = np.zeros(self.K)
    self._offsets = np.zeros(self.K)
    self._rotations = []
    for k in range(self.K):

        psi = F_inv(p_zeros[k]) / np.sqrt(1 - rhos[k])

        # compute slope / offset
        slope = -np.sqrt(rhos[k]) / np.sqrt(1 - rhos[k])
        slope *= f(psi) / np.sqrt(1 - F(psi)) / np.sqrt(F(psi))
        offset = 2 * np.arcsin(np.sqrt(F(psi)))

        # adjust for integer to normal range mapping
        offset += slope * (-normal_max_value)
        slope *= 2 * normal_max_value / (2**n_normal - 1)

        self._offsets[k] = offset
        self._slopes[k] = slope

        lry = LinearYRotation(slope, offset, n_normal,
                              i_state=self.i_normal, i_target=self.i_ps[k])
        self._rotations += [lry]
Pairs = [SFS, SC0, SC1, SC2, SObsr, SLHD, SMHD, SESD, SLC]
PairsLength = [len(SFS), len(SC0), len(SC1), len(SC2), len(SObsr), len(SLHD), len(SMHD), len(SESD), len(SLC)]
Combinations = len(SFS)*len(SC0)*len(SC1)*len(SC2)*len(SObsr)*len(SLHD)*len(SMHD)*len(SESD)*len(SLC)

#=================================================================================
# Latin Hypercube Sampling Design
# Candidate sets
sam = 300                                  # sample size
LHsets = lhs(9, samples=sam)               # Generate candidate sets
LHsets = norm(loc=0, scale=1).ppf(LHsets)  # Map the candidate sets onto N(0,1)

for i in range(9):                         # Substitute the LH matrix with real values
    Range = Pairs[i]
    N = PairsLength[i]
    Prob = 1 / N
    Interval = []
    for k in range(N - 1):
        Interval.append(norm.ppf(Prob * (k + 1)))
    for j in range(sam):
        for q in range(N - 1):
            if Interval[q] > LHsets[j, i]:
                LHsets[j, i] = Range[q]
                break
            if q == N - 2:
                LHsets[j, i] = Range[N - 1]

#=================================================================================
# Ground-truth Travel Time Data
# INRIX Travel Time Data
Data = xlrd.open_workbook('Your Path')
Table = Data.sheet_by_name(u'Sheet1')
I1 = Table.col_values(0)                   # Travel time for segment 1
I2 = Table.col_values(1)                   # Travel time for segment 2
#=================================================================================
def real_power(ms, mus, alpha=0.05):
    za = norm.ppf(1 - alpha / 2)
    sms = sqrt(ms)
    true_hits = (mus != 0).astype(double)
    return true_hits.dot(norm.cdf(-za + sms * mus) + norm.cdf(-za - sms * mus))
def F_inv(x):
    return norm.ppf(x)


def f(x):
    return norm.pdf(x)
def quantile_ci(self, p, alpha=0.05, method='cloglog'):
    """
    Returns a confidence interval for a survival quantile.

    Parameters
    ----------
    p : float
        The probability point for which a confidence interval is
        determined.
    alpha : float
        The confidence interval has nominal coverage probability
        1 - `alpha`.
    method : str
        Function to use for g-transformation, must be ...

    Returns
    -------
    lb : float
        The lower confidence limit.
    ub : float
        The upper confidence limit.

    Notes
    -----
    The confidence interval is obtained by inverting Z-tests.  The
    limits of the confidence interval will always be observed event
    times.

    References
    ----------
    The method is based on the approach used in SAS, documented here:

      http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm
    """

    tr = norm.ppf(1 - alpha / 2)

    method = method.lower()
    if method == "cloglog":
        g = lambda x: np.log(-np.log(x))
        gprime = lambda x: -1 / (x * np.log(x))
    elif method == "linear":
        g = lambda x: x
        gprime = lambda x: 1
    elif method == "log":
        g = lambda x: np.log(x)
        gprime = lambda x: 1 / x
    elif method == "logit":
        g = lambda x: np.log(x / (1 - x))
        gprime = lambda x: 1 / (x * (1 - x))
    elif method == "asinsqrt":
        g = lambda x: np.arcsin(np.sqrt(x))
        gprime = lambda x: 1 / (2 * np.sqrt(x) * np.sqrt(1 - x))
    else:
        raise ValueError("unknown method")

    r = g(self.surv_prob) - g(1 - p)
    r /= (gprime(self.surv_prob) * self.surv_prob_se)

    ii = np.flatnonzero(np.abs(r) <= tr)
    if len(ii) == 0:
        return np.nan, np.nan

    lb = self.surv_times[ii[0]]

    if ii[-1] == len(self.surv_times) - 1:
        ub = np.inf
    else:
        ub = self.surv_times[ii[-1] + 1]

    return lb, ub
def __sample_with_corr(mean_values, std_dev, desired_corr, num_samples, distro='normal'):
    """
    Randomly samples from a normal-multivariate distribution using LHS while
    attempting to get the desired_corr

    Parameters
    ----------
    mean_values
    desired_cov
    num_samples
    distro : str
        normal, lognormal (no proper handling of corr conversion)

    Returns
    -------
    """
    # raise Exception("This method is deprecated please use sample_with_corr")
    # draw samples in an uncorrelated manner
    num_vars = len(mean_values)
    samples = lhs_normal_sample(num_samples, np.zeros(num_vars), np.ones(num_vars))
    # samples = lhs_uniform_sample(num_vars, num_samples)

    # cholesky-like decomp for non PD matricies.
    T = np.corrcoef(samples.T)
    # this decomposition might be right but it's used wrong..
    # permutation, Q, e = gmw_cholesky(T)
    Q = np.linalg.cholesky(T)

    # this matrix has the same correlation as the desired RStar.
    # It is known to be PD since any neg eigenvalues were removed already.
    # this can be changed to using gmw_cholesky to be more general though.
    P = np.linalg.cholesky(desired_corr)

    dependent_samples = np.dot(samples, np.dot(P, np.linalg.inv(Q)).T)

    # for il=1:ntry
    #    for j=1:nvar
    #       % rank RB
    #       [r,id]=ranking(RB(:,j));
    #       % sort R
    #       [RS,id]=sort(R(:,j));
    #       % permute RS so has the same rank as RB
    #       z(:,j) = RS(r).*xsd(j)+xmean(j);
    #    end
    #    ae=sum(sum(abs(corrcoef(z)-corr)));
    #    if(ae<amin)
    #       zb=z;
    #       amin=ae;
    #    end
    # end
    ntry = 1
    amin = 1.8e308
    z = np.zeros(np.shape(samples))
    for il in range(ntry):
        for j in range(num_vars):
            r = np.argsort(dependent_samples[:, j])
            rank = np.zeros(np.shape(r), dtype=int)
            rank[r] = np.array(range(num_samples))
            rs = np.sort(samples[:, j])
            z[:, j] = np.multiply(rs[rank], std_dev[j]) + mean_values[j]
        ae = np.abs(np.corrcoef(z.T) - desired_corr).sum().sum()
        if ae < amin:
            zb = z
            amin = ae
        else:
            raise Exception('Could not order samples ae={0}'.format(ae))

    # zb are the uniform correlated samples, now transform them to desired
    #
    # transform the uniform sample about the mean to the unit interval
    for i in range(num_vars):
        zb[:, i] = (zb[:, i] - min(zb[:, i]))
        zb[:, i] = zb[:, i] / max(zb[:, i])
        slightly_lt0 = zb[:, i] <= 0.0  # + 1e-5
        slightly_gt1 = zb[:, i] >= 1.0  # - 1e-5
        zb[slightly_lt0, i] = 1e-10  # 1e-5
        zb[slightly_gt1, i] = 1 - 1e-10  # 1.0 - 1e-5

    distro = distro.lower()
    # using the desired distro's ppf, sample the distro with the correlated uniform sample
    for i in range(num_vars):
        # create a norm distro with mean/std_dev then sample from it using percent point func (inv of cdf percentiles)
        if distro == 'normal':
            zb[:, i] = norm.ppf(zb[:, i], loc=mean_values[i], scale=std_dev[i])
        elif distro == 'lognormal':
            # using mu/sigma from wiki + the scipy convention of loc and scale to specify the mean and sigma
            mean = np.log(mean_values[i] / (1 + std_dev[i]**2 / mean_values[i]**2)**0.5)
            sigma = (np.log(1 + std_dev[i]**2 / mean_values[i]**2))**0.5
            zb[:, i] = np.exp(norm.ppf(zb[:, i], loc=mean, scale=sigma))
        elif distro == 'uniform':
            zb[:, i] = uniform.ppf(zb[:, i], loc=mean_values[i], scale=std_dev[i])
        else:
            raise Exception(
                "Distro {0} not supported at the moment".format(distro))

    return zb
def plot_fdc(series, multimode=True, plot_enso=False, starting_month=None, lag=6,
             scale='log', xmin=0.0005, xmax=0.9995, ax=None, **kwargs):
    """
    Plots one or several flow duration curves (FDCs) for the series.
    The input series should be 1D or 2D.

    By default, if the series is 1D, one curve only will be plotted, whereas if
    the series is 2D, a curve will be plotted for each line of the series.

    A 1D series can also be converted into an annual series with the
    :keyword:`starting_month` parameter.  In that case, ``starting_month``
    should be an integer between 1 and 12 specifying the month at which the
    12-month period should start.  For example, to plot the FDCs for each water
    year (usually from April to the following March), use ``starting_month=4``.

    When ``enso=True``, ENSO phases are plotted with different colors.  When
    the series is 2D or if it has been converted to an annual frequency, the
    ENSO indices are defined with the ``full_year=True`` option, where an ENSO
    episode lasts at least 12 consecutive months.

    Parameters
    ----------
    series : TimeSeries
        Flow data.
    ax : {None, :class:`matplotlib.axes.Axes`}, optional
        Subplot where to plot the flow duration curves.
        If None, use the current plot.
    multimode : {True, False}, optional
        Whether to interpret a 2D input series as several series or a single one.
    starting_month : {None, integer}, optional
        First month of each year.
        If None, plots the global flow duration curve.
        Otherwise, ``starting_month`` must be an integer between 1 and 12,
        corresponding to the first month of the water year (usually, 4 for April).
    plot_enso : {True, False}, optional
        Whether to plot each ENSO phase with a different color.
    lag : {integer}, optional
        Number of months of lag for the definition of ENSO indices.
        For example, if lag=6, the ENSO phase starting in Oct. 2001 is applied
        starting on Apr. 2002.
        If None, use a lag computed as the time difference between
        ``starting_month`` and the first month of the reference season of the
        ENSO indicator (or October if undefined).
    scale : {'log','lin'}, optional
        String indicating whether the x-axis is in log (``'log'``) or linear
        (``'lin'``) scale.
        If ``'log'``, each plotting position is expressed as a Gaussian pdf.
    other parameters :
        The parameters recognized by the :func:`matplotlib.pyplot.plot`
        function are also recognized.

    Raises
    ------
    TypeError
        If ``plot_enso=True`` but the series is not a
        :class:`~scikits.hydroclimpy.enso.ClimateSeries`.
    ValueError
        * If ``starting_month`` is not between 1 and 12.
        * If ``starting_month`` is defined but the initial series is not 1D.
    """
    if ax is None:
        ax = gca()
    # Make sure we have at most a 2D series ...............
    if series.ndim > 2:
        raise ValueError("The input series should be 2D at most!")
    # Get the ENSO indicator associated w/ the series (if any)
    ensoindicator = getattr(series, 'ensoindicator', None)
    # Check the starting month ............................
    if starting_month is not None:
        # Make sure we have an integer between 1 and 12
        starting_month = int(starting_month)
        if (starting_month < 1) or (starting_month > 12):
            errmsg = "The starting month should be between 1 (Jan.) and "\
                     "12 (Dec.)! (got %s instead)" % starting_month
            raise ValueError(errmsg)
    # Check whether we need to plot the ENSO information ..
    if plot_enso is True:
        # Make sure we have some ENSO information .........
        if ensoindicator is None:
            errmsg = "No ENSO information is associated with the input series."
            raise InvalidENSOError(errmsg)
        # Reset the indices if we have a starting_month ...
        if starting_month is not None:
            if lag is None:
                refmonth = (ensoindicator.reference_season or [10, ])[0]
                lag = (starting_month + 12 - refmonth) % 12
            series.set_ensoindices(full_year=True, lag=lag)
        else:
            # Make sure that the indices are already set
            series.set_ensoindices()
        # Load the default marker colors ..................
        from scikits.hydroclimpy.plotlib.ensotools import ENSOlines, \
                                                          ENSOmarkers, \
                                                          ENSOlabels
    # No ENSO information to plot : get basic lines & markers
    else:
        ENSOlines = {'G': '#cccccc'}
        ENSOmarkers = {'G': '#cccccc'}
    # Check whether we are in multimode or not ............
    ## 1D input
    if series.ndim == 1:
        # Convert to annual if needed
        if starting_month:
            multimode = True
            series = series.convert(FR_ANNSTART[starting_month - 1], func=None)
        else:
            multimode = False
        _series = series.view(ma.MaskedArray)
    ## 2D input
    else:
        # w/ starting month
        if starting_month is not None:
            errmsg = "The input series should be 2D! (got %s instead)"
            raise ValueError(errmsg % str(series.shape))
        # w/o multimode
        if not multimode:
            _series = series.view(ma.MaskedArray).ravel()
    # Get the number of valid data per year (ie, per row)
    n = _series.count(axis=-1)
    # Get the xdata .........
    scale = scale[:3].lower()
    if scale == 'lin':
        if multimode:
            xdata = [np.linspace(1. / (nx + 1), 1 - 1. / (nx + 1), nx)
                     for nx in n]
        else:
            xdata = np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n)
            # xdata = ma.empty(len(series), dtype=float)
            # xdata[:n] = np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n)
    elif scale == 'log':
        if multimode:
            xdata = [norm.ppf(np.linspace(1. / (nx + 1), 1 - 1. / (nx + 1), nx))
                     for nx in n]
        else:
            xdata = norm.ppf(np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n))
            # xdata = ma.empty(len(series), dtype=float)
            # xdata[:n] = norm.ppf(np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n))
    else:
        raise ValueError("Unrecognized option '%s' for scale: "
                         "should be in ['lin','log'])" % scale)
    # Get some defaults .....
    if multimode:
        lwdefault = 0.8
        zorderdefault = 3
        colordefault = ENSOlines['G']
    else:
        lwdefault = 2
        zorderdefault = 10
        colordefault = 'k'
    marker = kwargs.pop('marker', 'o')
    markersize = kwargs.get('markersize', kwargs.get('ms', 3))
    lw = kwargs.pop('linewidth', kwargs.pop('lw', lwdefault))
    zorder = kwargs.pop('zorder', zorderdefault)
    color = kwargs.pop('color', kwargs.pop('c', colordefault))
    # Multi-mode : one line per year ......................
    if multimode:
        if plot_enso:
            ensoindices = series.ensoindices
            if ensoindices.ndim > 1:
                ensoindices = ensoindices[:, 0]
            # ENSO mode : different colors for different phases
            # eidx = series.ensoindices._data
            # # Take the first column if it's 2D
            # if eidx.ndim > 1:
            #     eidx = eidx[:, 0]
            for (i, attr) in zip((-1, 0, 1), ('cold', 'neutral', 'warm')):
                key = attr[0].upper()
                label = ENSOlabels[key]
                ydata = series[ensoindices == i]
                ydata = [np.sort(_).compressed()[::-1] for _ in ydata]
                # ydata = np.sort(getattr(series, attr).compressed())[::-1]
                points = [zip(x, y) for (x, y) in zip(xdata, ydata)]
                collec = LineCollection(points, label=ENSOlabels[key],
                                        color=ENSOlines[key], zorder=zorder,
                                        linewidth=lw)
                ax.add_collection(collec, autolim=True)
        else:
            ydata = [np.sort(y.compressed())[::-1] for y in _series]
            points = [zip(x, y) for (x, y) in zip(xdata, ydata)]
            label = kwargs.pop('label', None)
            collec = LineCollection(points, label=label, linewidth=lw,
                                    colors=ENSOlines['G'])
            ax.add_collection(collec, autolim=True)
    # One line for the whole dataset ......................
    else:
        ydata = ma.sort(series.compressed(), endwith=False)[::-1]
        points = [zip(xdata, ydata._series)]
        label = kwargs.pop('label', 'none')
        collec = LineCollection(points, label=label, linewidth=lw,
                                colors=color, zorder=zorder)
        ax.add_collection(collec, autolim=True)
        # If we need to add some colors
        if plot_enso and marker:
            for attr in ('cold', 'neutral', 'warm'):
                key = attr[0].upper()
                label = ENSOlabels[key]
                color = ENSOmarkers[key]
                # ydata = ma.sort(getattr(series, attr), endwith=False)[::-1]
                current = getattr(ydata, attr)._series
                _fdc = ax.plot(xdata, current, ls='', lw=0, marker=marker,
                               ms=markersize, mfc=color, mec=color,
                               label=label, zorder=zorder)
    #........................
    set_normal_limits(ax, xmin=xmin, xmax=xmax, scale=scale)
    ax.set_ylim(_series.min(), _series.max())
    return ax
def quantile_ci(self, p, alpha=0.05, method='cloglog'): """ Returns a confidence interval for a survival quantile. Parameters ---------- p : float The probability point for which a confidence interval is determined. alpha : float The confidence interval has nominal coverage probability 1 - `alpha`. method : string Function to use for g-transformation, must be ... Returns ------- lb : float The lower confidence limit. ub : float The upper confidence limit. Notes ----- The confidence interval is obtained by inverting Z-tests. The limits of the confidence interval will always be observed event times. References ---------- The method is based on the approach used in SAS, documented here: http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm """ tr = norm.ppf(1 - alpha / 2) method = method.lower() if method == "cloglog": g = lambda x: np.log(-np.log(x)) gprime = lambda x: -1 / (x * np.log(x)) elif method == "linear": g = lambda x: x gprime = lambda x: 1 elif method == "log": g = lambda x: np.log(x) gprime = lambda x: 1 / x elif method == "logit": g = lambda x: np.log(x / (1 - x)) gprime = lambda x: 1 / (x * (1 - x)) elif method == "asinsqrt": g = lambda x: np.arcsin(np.sqrt(x)) gprime = lambda x: 1 / (2 * np.sqrt(x) * np.sqrt(1 - x)) else: raise ValueError("unknown method") r = g(self.surv_prob) - g(1 - p) r /= (gprime(self.surv_prob) * self.surv_prob_se) ii = np.flatnonzero(np.abs(r) <= tr) if len(ii) == 0: return np.nan, np.nan lb = self.surv_times[ii[0]] if ii[-1] == len(self.surv_times) - 1: ub = np.inf else: ub = self.surv_times[ii[-1] + 1] return lb, ub
def test_0(size_N, mu_0, sgm_0):
    x = spnorm.rvs(size=size_N, loc=mu_0, scale=sgm_0)
    inv_x_0 = spnorm.ppf(x, loc=mu_0, scale=sgm_0)

    opencl_kernel = """ """  # kernel source truncated in the original snippet
def F_inv(x): return norm.ppf(x)
# https://www.hko.gov.hk/en/wxinfo/season/fcvsobs_seasonal.htm
climat_yrs = [(1981, 2011), (1991, 2021)]

# seasonal prediction by HKO
# 0: "Normal to below normal"
# 1: "Normal to above normal"
# https://www.hko.gov.hk/en/wxinfo/season/fcvsobs_seasonal.htm
b_norm = [
    [1, 1, 1, 1, 1, 1, 1, 0],  # temp
    [1, 1, 0, 0, 0, 1, 1, 0]   # rf
]

# CDF to Z-scores for above normal and below normal
CDF_AN = 0.7
CDF_BN = 0.3
Z_AN = norm.ppf(CDF_AN)
Z_BN = norm.ppf(CDF_BN)


# Get all the data, filtered by season only
def extract_by_season(season='spring'):
    if season == 'spring':
        return df_all[df_all['month'].isin([3, 4, 5])]
    elif season == 'summer':
        return df_all[df_all['month'].isin([6, 7, 8])]
    elif season == 'autumn':
        return df_all[df_all['month'].isin([9, 10, 11])]
    elif season == 'winter':
        return df_all[df_all['month'].isin([12, 1, 2])]
    else:
        raise ValueError('Season not defined')
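# Sanity check (added for illustration): the above-normal / below-normal CDF thresholds used
# above correspond to z ≈ ±0.524, i.e. norm.ppf(0.7) ≈ 0.5244 and norm.ppf(0.3) ≈ -0.5244.
from scipy.stats.distributions import norm
print(round(norm.ppf(0.7), 4), round(norm.ppf(0.3), 4))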
pname = fname.replace("patterns_", "patterns_plot_")
pname = pname.replace(".txt", ".pdf")
pdf = PdfPages(pname)

lb = ["1→1", "0→1", "1→0", "0→0", "-1→0", "0→-1", "-1→-1"]

from turbo_colormap import turbo_colormap_data
c = turbo_colormap_data
ii = np.linspace(15, len(c) - 30, 7)  # Avoid very dark colors
ii = np.round(ii).astype(int)         # np.int was removed in NumPy 1.24; use the builtin
cols = [c[i] for i in ii]
syms = ['s', 'o', 'x', '+', 'D', '>', '<']

next(fid)  # Skip initial ```

fq = norm.ppf(1 - 0.025)

while True:

    group = next(fid).rstrip()
    if group == "```":
        break

    head = next(fid).strip().split()
    table = [next(fid).rstrip() for k in range(7)]
    rows = [x[0:36].rstrip() for x in table]
    table = [x[36:].lstrip() for x in table]

    next(fid)
    next(fid)
    next(fid)
##
## results summary
##

import numpy as np
import pandas as pd
from scipy.stats.distributions import norm

from .tools import maybe_diag

##
## constants
##

z95 = norm.ppf(0.975)

##
## param summary
##

def param_table(beta, y_name, x_names, sigma=None):
    # basic frame
    frame = pd.DataFrame({
        'coeff': beta,
    }, index=x_names)
    frame = frame.rename_axis(y_name, axis=1)

    # handle sigma cases
    if sigma is None:
        return frame
# (x'x)^{-1} = (vs^2v')^{-1}
xtx = np.dot(vt.T / s**2, vt)

# Standard error for the interaction term
se = np.sqrt(uv * xtx[3, 3])

# Z-scores for the interaction term
zs = params[:, 3] / se
zs = zs.dropna()
zsa = np.abs(zs)

# P-values for the interaction term
pv = student_t.cdf(-np.abs(zs), xmat.shape[0] - xmat.shape[1])

# Bonferroni threshold
bt = norm.ppf(1 - 0.025 / zs.shape[0])

# Calculate the FDR over a range of Z-score thresholds
fdr = []
n = len(zs)
for t in np.linspace(0, 6, 20):
    d = np.sum(zsa > t)
    f = 2 * n * norm.cdf(-t) / d
    fdr.append([t, f, d])
fdr = np.asarray(fdr)

# Plots relating to FDR
plt.clf()
plt.grid(True)
plt.plot(fdr[:, 0], fdr[:, 1], '-')
plt.xlabel("Z-score threshold", size=15)
def betanorm(ms, xbars, alpha=0.05):
    """Return the power of a Z-test for given means and sample sizes."""
    za = norm.ppf(1 - alpha / 2)
    sms = sqrt(ms)
    bn = norm.cdf(-za + sms * xbars) + norm.cdf(-za - sms * xbars)
    return bn
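# Worked example (illustrative, assuming the betanorm function above with sqrt/norm imported
# from math/scipy as in the original module): with m = 25 observations and a standardized
# mean shift of 0.5, the two-sided Z-test power is
# norm.cdf(-1.96 + 2.5) + norm.cdf(-1.96 - 2.5) ≈ 0.705.
print(betanorm(25, 0.5, alpha=0.05))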
plt.clf()
plt.grid(True)
for j in range(xm.shape[1]):
    plt.plot(x, xm[:, j], '-')
plt.xlabel("x", size=15)
plt.ylabel("Spline value", size=15)
plt.title(title)
pdf.savefig()

x = np.linspace(-1, 1, n)
for k in range(3, 11):
    fml = "bs(x, %d)" % k
    title = "df=%d, uniform" % k
    plot(fml, x, title)

p = np.linspace(0.001, 0.999, n)
x = -np.log(1 - p)
for k in range(3, 11):
    fml = "bs(x, %d)" % k
    title = "df=%d, exponential" % k
    plot(fml, x, title)

x = norm.ppf(p)
for k in range(3, 11):
    fml = "bs(x, %d)" % k
    title = "df=%d, Gaussian" % k
    plot(fml, x, title)

pdf.close()
def F_inv(x):  # pylint: disable=invalid-name
    return norm.ppf(x)