def test_permuted_ols_check_h0_noeffect_signswap(random_state=0):
    rng = check_random_state(random_state)
    # design parameters
    n_samples = 100
    # create dummy design with no effect
    target_var = rng.randn(n_samples, 1)
    tested_var = np.ones((n_samples, 1))
    # permuted OLS
    # We check that h0 is close to the theoretical distribution, which is
    # known for this simple design (= t(n_samples - dof)).
    perm_ranges = [10, 100, 1000]  # test various number of permutations
    all_kstest_pvals = []
    # we compute the Mean Squared Error between cumulative Density Function
    # as a proof of consistency of the permutation algorithm
    all_mse = []
    for i, n_perm in enumerate(np.repeat(perm_ranges, 10)):
        pval, orig_scores, h0 = permuted_ols(
            tested_var, target_var, model_intercept=False,
            n_perm=n_perm, two_sided_test=False, random_state=i)
        assert_equal(h0.size, n_perm)
        # Kolmogorov-Smirnov test
        kstest_pval = stats.kstest(h0, stats.t(n_samples).cdf)[1]
        all_kstest_pvals.append(kstest_pval)
        mse = np.mean(
            (stats.t(n_samples).cdf(np.sort(h0))
             - np.linspace(0, 1, h0.size + 1)[1:]) ** 2)
        all_mse.append(mse)
    all_kstest_pvals = np.array(all_kstest_pvals).reshape(
        (len(perm_ranges), -1))
    all_mse = np.array(all_mse).reshape((len(perm_ranges), -1))
    # check that a difference between distributions is not rejected by KS test
    assert_array_less(0.01 / (len(perm_ranges) * 10.), all_kstest_pvals)
    # consistency of the algorithm: the more permutations, the less the MSE
    assert_array_less(np.diff(all_mse.mean(1)), 0)
def _hinv(self, v, u, rotation=0, *theta):
    """!
    @brief Inverse H function (Inv Conditional distribution) of T copula.
    TODO: CHECK UU and VV ordering!
    """
    kT = self.kTau(rotation, *theta)
    kTs = kT / abs(kT)
    kTM = 1 if kTs < 0 else 0
    h1 = 1.0 - np.power(theta[0], 2.0)
    nu1 = theta[1] + 1.0
    dist1 = stats.t(df=theta[1], scale=1.0, loc=0.0)
    dist2 = stats.t(df=nu1, scale=1.0, loc=0.0)
    UU = np.array(kTM + kTs * u)  # TODO: check input bounds
    VV = np.array(v)
    # inverse CDF yields quantiles
    x = dist2.ppf(UU)
    y = dist1.ppf(VV)
    # eval H function
    uu = dist1.cdf(x * np.sqrt((theta[1] + np.power(y, 2.0)) * h1 / nu1) + theta[0] * y)
    return uu
def create_probe_statistic(probe_values, fpr, verbose=0):
    # Create prediction interval statistics based on randomly permuted probe features (based on real features)
    n = len(probe_values)
    if n == 0:
        if verbose > 0:
            logging.info(
                "All probes were infeasible. All features considered relevant."
            )
        # # If all probes were infeasible we expect an empty list
        # # If they are infeasible it also means that only strongly relevant features were in the data
        # # As such we just set the prediction without considering the statistics
        low_t = 0
        up_t = 0
    elif n == 1:
        val = probe_values[0]
        low_t = val
        up_t = val
    else:
        probe_values = np.asarray(probe_values)
        mean = probe_values.mean()
        s = probe_values.std()
        low_t = mean + stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
        up_t = mean - stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
    return ProbeStatistic(low_t, up_t, n)
def ci_specified_risk_metrics(returns, weights, portfolio_vol, ci, d):
    """
    Calculate the Analytical VaR and Expected Shortfall for a portfolio, using both the
    Normal distribution and the T-distribution.
    :param returns: np.array([float]) - the historical portfolio returns
    :param weights: np.array([float]) - the weights of the assets in the portfolio
    :param portfolio_vol: float - the volatility of the portfolio
    :param ci: float - the confidence interval at which to take the Analytical VaR
    :param d: int - the number of degrees of freedom to use
    :return: float, float, float, float, float, float
    """
    # calculate the standard deviation of the portfolio
    sigma = np.sqrt(portfolio_vol)
    # calculate the mean return of the portfolio
    mu = np.sum(portfolio_returns(returns, weights)) / returns.shape[1]
    # integrate the Probability Density Function to find the Analytical Value at Risk for both Normal and t distributions
    var_level = stats.norm.ppf(ci, mu, sigma)
    t_dist_var_level = stats.t(d).ppf(ci)
    # calculate the expected shortfall for each distribution - this is the expected loss (in % daily returns) for the
    # portfolio in the worst a_var% of cases - it is effectively the mean of the values along the x-axis from
    # -infinity% to a_var%
    es = (stats.norm.pdf(stats.norm.ppf((1 - ci))) * sigma) / (1 - ci) - mu
    t_dist_es = (stats.t(d).pdf(stats.t(d).ppf(
        (1 - ci))) * sigma * (d + (stats.t(d).ppf(
            (1 - ci)))**2)) / ((1 - ci) * (d - 1)) - mu
    return sigma, mu, var_level, t_dist_var_level, es, t_dist_es
def risk_metrics(returns, weights, portfolio_vol, var_p, d):
    """
    Calculate the Analytical VaR and Expected Shortfall for a portfolio, using both the
    Normal distribution and the T-distribution.
    :param returns: np.array([float]) - the historical portfolio returns
    :param weights: np.array([float]) - the weights of the assets in the portfolio
    :param portfolio_vol: float - the volatility of the portfolio
    :param var_p: float - the value of the daily returns at which to take the Analytical VaR
    :param d: int - the number of degrees of freedom to use
    :return: float, float, float, float, float, float
    """
    # calculate the standard deviation of the portfolio
    sigma = np.sqrt(portfolio_vol)
    # calculate the mean return of the portfolio
    mu = np.sum(portfolio_returns(returns, weights)) / returns.shape[1]
    # integrate the Probability Density Function to find the Analytical Value at Risk for both Normal and t distributions
    a_var = stats.norm(mu, sigma).cdf(var_p)
    t_dist_a_var = stats.t(d).cdf(var_p)
    # calculate the expected shortfall for each distribution - this is the expected loss (in % daily returns) for the
    # portfolio in the worst a_var% of cases - it is effectively the mean of the values along the x-axis from
    # -infinity% to a_var%
    es = (stats.norm(mu, sigma).pdf(stats.norm(mu, sigma).ppf((1 - a_var))) * sigma) / (1 - a_var) - mu
    percent_point_function = stats.t(d).ppf((1 - a_var))
    t_dist_es = -1 / (1 - a_var) * (1 - d)**(-1) * (d - 2 + percent_point_function**2) \
        * stats.t(d).pdf(percent_point_function) * sigma - mu
    return sigma, mu, a_var, t_dist_a_var, es, t_dist_es
def plot_power(ax, s1, s2, xlabel='', ylabel='', title='', **options):
    x = np.linspace(-8, 8, 250)
    se = se_welch(s1, s2)
    df = welch_satterhwaithe_df(s1, s2)
    null = stats.t(df=df)
    alt = stats.t(loc=(s1.mean() - s2.mean()) / se, df=df)
    sns.lineplot(x, null.pdf(x), label='null')
    sns.lineplot(x, alt.pdf(x), label='alt')
    ax.vlines(x=null.ppf(0.975), ymin=0, ymax=0.5, color='#000000',
              linestyle='--', label='alpha', zorder=1)
    ax.fill_between(x, alt.pdf(x), where=(x >= null.ppf(0.975)),
                    color="red", alpha=0.25)
    ax.set_xlabel(xlabel, fontsize=15)
    ax.set_ylabel(ylabel, fontsize=15)
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.set_title(title, fontsize=15)
    ax.legend(fontsize='xx-large', loc='upper left')
def stats(x):
    n = len(TRACTS)
    m = x.mean()
    se = x.std() / sqrt(n)
    e = se * t(n - 1).ppf(.975)
    p = 2 - 2 * t(n - 1).cdf(abs(m) / se)
    return (m, se, e, p)
def sample_elections(plan_df, n=1000, p_seats=False):
    elec_results = plan_df[['2008', '2012', '2016']].values
    state_year_results = elec_results.mean(axis=0)
    state_vote_share_mean = state_year_results.mean()
    state_vote_t = t(df=2, loc=state_vote_share_mean,
                     scale=state_year_results.std(ddof=1))
    state_vote_share_samples = state_vote_t.rvs(n)
    district_mean = elec_results.mean(axis=1)
    district_std = elec_results.std(axis=1, ddof=1)
    district_vote_t = t(df=2, loc=district_mean, scale=district_std)
    district_static_samples = district_vote_t.rvs((n, len(district_mean)))
    district_mean_samples = district_static_samples.mean(axis=1)
    district_vote_shares = (
        district_static_samples +
        (state_vote_share_samples - district_mean_samples)[:, np.newaxis])
    if p_seats:
        seat_shares = np.sum(
            1 - t.cdf(.5, df=2, loc=district_vote_shares, scale=district_std),
            axis=1)
    else:
        seat_shares = np.sum(district_vote_shares > 0.5, axis=1)
    vote_shares = np.sum(district_vote_shares, axis=1)
    return seat_shares / len(plan_df), vote_shares / len(plan_df)
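# Hedged usage sketch for sample_elections above (not from the original source):
# it assumes the function is in scope together with numpy and scipy.stats.t, and
# that plan_df holds district-level two-party vote shares for 2008/2012/2016.
# The demo frame and the sample size of 200 are illustrative values only.
import numpy as np
import pandas as pd
from scipy.stats import t

_demo_plan = pd.DataFrame({
    '2008': [0.48, 0.55, 0.61, 0.44],
    '2012': [0.50, 0.53, 0.63, 0.42],
    '2016': [0.47, 0.52, 0.65, 0.41],
})
_seats, _votes = sample_elections(_demo_plan, n=200)
# average seat share and vote share across the sampled elections
print(_seats.mean(), _votes.mean())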
def create_probe_statistic(probe_values, fpr, verbose=0):
    # Create prediction interval statistics based on randomly permuted probe features (based on real features)
    n = len(probe_values)
    if n == 0:
        if verbose > 0:
            logging.info(
                "All probes were infeasible. All features considered relevant."
            )
        # # If all probes were infeasible we expect an empty list
        # # If they are infeasible it also means that only strongly relevant features were in the data
        # # As such we just set the prediction without considering the statistics
        mean = 0
    else:
        probe_values = np.asarray(probe_values)
        mean = probe_values.mean()
    if mean == 0:
        lower_threshold, upper_threshold = mean, mean
        s = 0
    else:
        s = probe_values.std()
        lower_threshold = mean + stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
        upper_threshold = mean - stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
    if verbose > 0:
        print(
            f"FS threshold: {lower_threshold}-{upper_threshold}, Mean:{mean}, Std:{s}, n_probes {n}"
        )
    return lower_threshold, upper_threshold
def CalcErr(Vals, stds, Flux, FluxErr, Tj, TjErr, fitFunc, **kwargs):
    outErrs = np.zeros(len(Vals))
    npar = len(Flux)
    tvalsq = (stats.t(df=(npar - 4)).ppf(0.975))**2 / npar
    relmeasErrsq = np.sum(FluxErr**2) / ((np.sum(Flux))**2) + np.sum(TjErr**2) / ((np.sum(Tj))**2)
    if fitFunc == 'EMA':
        taug = kwargs['taug']
        x0val = taug[0]
        x0std = taug[1]
        SO2Flux = taug[2]
        SO2FluxErr = taug[3]
        SO2Tj = taug[4]
        SO2TjErr = taug[5]
        nSO2par = len(SO2Flux)
        tvalsqg = (stats.t(df=(nSO2par - 4)).ppf(0.975)) ** 2 / nSO2par
        reltaugErrsq = tvalsqg * (np.sum(SO2FluxErr**2) / ((np.sum(SO2Flux))**2)
                                  + np.sum(SO2TjErr**2) / ((np.sum(SO2Tj))**2)
                                  + (x0std / x0val) ** 2) + 0.01 + 0.01
        for ivar in np.arange(len(Vals)):
            outErrs[ivar] = np.absolute(Vals[ivar]) * np.sqrt(
                tvalsq * (relmeasErrsq + (stds[ivar] / Vals[ivar])**2) + 0.01 + 0.01 + 0.04 + reltaugErrsq)
    else:
        for ival in np.arange(len(Vals)):
            outErrs[ival] = np.absolute(Vals[ival]) * np.sqrt(
                tvalsq * (relmeasErrsq + (stds[ival] / Vals[ival])**2) + 0.01 + 0.01)
    return outErrs
def __init__(self, mean, cov, df=None, random_state=1):
    self.mean = mean
    self.cov = cov
    self.sd = sd = np.sqrt(np.diag(cov))
    if df is None:
        self.dist = stats.multivariate_normal(mean=mean, cov=cov)
        self.udist = stats.norm(loc=mean, scale=sd)
        self.std_udist = stats.norm(loc=0., scale=1.)
    else:
        sigma = cov * (df - 2) / df
        self.dist = MVT(mean=mean, sigma=sigma, df=df)
        self.udist = stats.t(loc=mean, scale=sd, df=df)
        self.std_udist = stats.t(loc=0., scale=1., df=df)
    self.dist.random_state = random_state
    self.udist.random_state = random_state
    self.std_udist.random_state = random_state
    self._chol = cholesky(self.cov)
    self._pchol = pivoted_cholesky(self.cov)
    e, v = np.linalg.eigh(self.cov)
    # To match Bastos and O'Hagan definition
    # i.e., eigenvalues ordered from largest to smallest
    e, v = e[::-1], v[:, ::-1]
    ee = np.diag(np.sqrt(e))
    self._eig = (v @ ee)
def __init__(self, rho, nu, Law_RS, Law_RF):
    self.rho = rho          # Dependence Parameter
    self.nu = nu            # Degree of Freedom
    self.Law_RS = Law_RS    # Marginal Distribution of Spot
    self.Law_RF = Law_RF    # Marginal Distribution of Future
    self.meta_t = multivariate_t(nu=nu,                      # DF
                                 Sigma=np.array([[1, rho],   # COV
                                                 [rho, 1]]))
    self.t1 = stats.t(df=nu)  # inner
    self.t2 = stats.t(df=nu)
def __init__(self, a, b, beginning=None, ending=None, beginning_factor=None, ending_factor=None):
    """
    start and end can be in either datetime or unix time
    """
    a, b = UnixTime(a), UnixTime(b)
    assert a < b, "'b' should be greater than 'a'"
    if (beginning, ending) != (None, None):
        assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                  "either 'beginning_factor' and 'ending_factor' " \
                                                                  "or 'beginning' and 'ending'"
    if beginning_factor is not None:
        assert beginning_factor > 0
        self.beginning_factor = beginning_factor
    if ending_factor is not None:
        assert ending_factor > 0
        self.ending_factor = ending_factor
    if (beginning, ending) != (None, None):
        beginning = UnixTime(beginning)
        ending = UnixTime(ending)
    else:
        beginning, ending = 0, 0
        while not a < beginning < ending < b:
            beginning = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    a + float(b - a) / self.beginning_factor,
                    float(b - a) / self.beginning_factor))
            ending = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    b - float(b - a) / self.ending_factor,
                    float(b - a) / self.ending_factor))
    TemporalEventPiecewiseLinear.__init__(self, {a: 0, beginning: 1}, {ending: 1, b: 0})
def p(self):
    n = len(self.x)
    x1 = self.x[self.x < self.peak]
    y1 = self.y[self.x < self.peak]
    se1 = sqrt(sum(square(y1 - self.b[0] + self.b[1] * (x1 - self.peak))) / (n - 2)) / sqrt(
        sum(square(x1 - mean(x1))))
    x2 = self.x[self.x >= self.peak]
    y2 = self.y[self.x >= self.peak]
    se2 = sqrt(sum(square(y2 - self.b[0] + self.b[2] * (x2 - self.peak))) / (n - 2)) / sqrt(
        sum(square(x2 - mean(x2))))
    return (2 * (1 - t(n - 2).cdf(abs(self.b[1] / se1))),
            2 * (1 - t(n - 2).cdf(abs(self.b[2] / se2))))
def ci_se(self, alpha, symmetric):
    if symmetric == True:
        qq = t(df=self.n).ppf(1 - alpha / 2)
        return np.array(
            [self.theta - self.se * qq, self.theta + self.se * qq])
    else:
        qq = t(df=self.n).ppf(1 - alpha)
        if symmetric == 'lower':
            return self.theta - qq * self.se
        else:
            return self.theta + qq * self.se
def show_continuous():
    """Show a variety of continuous distributions"""
    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     "Normal Distribution", "Z", "P(Z)", "")

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     "Exponential Distribution", "X", "P(X)", "")

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), "g")
    hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     "T-Distribution", "X", "P(X)", ["normal", "t=4", "t=10"])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     "F-Distribution", "F", "P(F)", ["(3,4) DOF", "(10,15) DOF"])

    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(
        arange(0, 5, 0.02), stats.weibull_min(1), stats.weibull_min(2),
        "Weibull Distribution", "X", "P(X)", ["k=1", "k=2"],
        xmin=0, xmax=4,
    )

    # Uniform distribution
    showDistribution(x, stats.uniform, "",
                     "Uniform Distribution", "X", "P(X)", "")

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     "Logistic Distribution", "X", "P(X)", ["Normal", "Logistic"])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), "",
                     "Lognormal Distribution", "X", "lognorm(X)", "", xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x, 2))
    xlim(-10, 4)
    title("Lognormal Distribution")
    xlabel("log(X)")
    ylabel("lognorm(X)")
    show()
def probabilidade_dado_um_intervalo(
        numero_amostras, x1, x2, mu=0, sigma=1, df=10000000
):
    """Compute the probability over an interval of values."""
    t1 = (x1 - mu) / (sigma / sqrt(numero_amostras))
    t2 = (x2 - mu) / (sigma / sqrt(numero_amostras))
    p_t1 = stats.t(df=df, loc=0, scale=1).cdf(t1)
    p_t2 = stats.t(df=df, loc=0, scale=1).cdf(t2)
    probabilidade = p_t2 - p_t1
    return t1, t2, probabilidade
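# Hedged usage sketch for probabilidade_dado_um_intervalo above (not from the
# original source): it assumes the function is in scope with math.sqrt and
# scipy.stats available. With the very large default df the t CDF is close to
# the standard normal CDF, so for one observation the interval (-1.96, 1.96)
# should carry roughly 0.95 probability.
from math import sqrt
from scipy import stats

_t1, _t2, _prob = probabilidade_dado_um_intervalo(1, -1.96, 1.96, mu=0, sigma=1)
print(_t1, _t2, _prob)  # approximately -1.96, 1.96, 0.95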
def __init__(self, a, b, beginning=None, ending=None, beginning_factor=None, ending_factor=None):
    """
    start and end can be in either datetime or unix time
    """
    a, b = UnixTime(a), UnixTime(b)
    assert a < b, "'b' should be greater than 'a'"
    if (beginning, ending) != (None, None):
        assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                  "either 'beginning_factor' and 'ending_factor' " \
                                                                  "or 'beginning' and 'ending'"
        if not a < beginning < ending < b:
            raise AttributeError(
                "The inputs should satisfy 'a < beginning < ending < b' relation"
            )
    if beginning_factor is not None:
        assert beginning_factor > 0
        self.beginning_factor = beginning_factor
    if ending_factor is not None:
        assert ending_factor > 0
        self.ending_factor = ending_factor
    if (beginning, ending) == (None, None):
        beginning, ending = 0, 0
        while not a < beginning < ending < b:
            beginning = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    a + float(b - a) / self.beginning_factor,
                    float(b - a) / self.beginning_factor))
            ending = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    b - float(b - a) / self.ending_factor,
                    float(b - a) / self.ending_factor))
    TemporalEvent.__init__(self,
                           uniform(loc=a, scale=UnixTime(beginning - a)),
                           uniform(loc=ending, scale=UnixTime(b - ending)),
                           bins=4)
def show_continuous():
    """Show a variety of continuous distributions"""
    x = np.linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)', '')

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)', '')

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plt.plot(x, stats.norm.pdf(x), 'g-.')
    plt.hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     'F-Distribution', 'F', 'P(F)', ['(3,4) DOF', '(10,15) DOF'])

    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(np.arange(0, 5, 0.02), stats.weibull_min(1), stats.weibull_min(2),
                     'Weibull Distribution', 'X', 'P(X)', ['k=1', 'k=2'],
                     xmin=0, xmax=4)

    # Uniform distribution
    showDistribution(x, stats.uniform, '',
                     'Uniform Distribution', 'X', 'P(X)', '')

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)', ['Normal', 'Logistic'])

    # Lognormal distribution
    x = np.logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)', '', xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plt.plot(np.log(x), stats.lognorm.pdf(x, 2))
    plt.xlim(-10, 4)
    plt.title('Lognormal Distribution')
    plt.xlabel('log(X)')
    plt.ylabel('lognorm(X)')
    plt.show()
def perform_analysis(model: RegressionModel) -> np.array:
    """
    Calculates a set of statistics for a given linear regression model
    @param model: RegressionModel object to be studied
    @return: indices of factors which may be equal to zero (for significance level 0.1)
    """
    n, m = model.x.shape
    sum_errs2 = sum((model.y - model.y_hat)**2)
    s2 = sum_errs2 / (n - m)
    print("s^2 = {:.3f}".format(s2))
    cov_a = s2 * model.inv_xx
    print_matrix(cov_a, var_name="cov(a)")
    s_a = np.diag(cov_a)**.5
    print_matrix(s_a, var_name="standard deviation of `a`")
    ixx_diag = np.array([np.diag(model.inv_xx)])
    corr_a = model.inv_xx / np.sqrt(ixx_diag.T @ ixx_diag)
    print_matrix(corr_a, var_name="corr(a)")
    sum_y2_centered = sum((model.y - np.mean(model.y))**2)
    r2 = 1 - sum_errs2 / sum_y2_centered
    print("R^2 = {:.4f}".format(r2))
    r2n = 1 - (sum_errs2 / (n - m)) / (sum_y2_centered / (n - 1))
    print("R_n^2 = {:.4f}".format(r2n))
    gamma = .95
    quantile = stats.t(n - m).ppf((1 + gamma) / 2)
    a_confidence_intervals = np.array(
        [model.a - s_a * quantile, model.a + s_a * quantile])
    print_matrix(
        a_confidence_intervals.T,
        var_name="confidence intervals for `a` with confidence level {}".format(gamma))
    joint_alpha = (1 - gamma) / m
    joint_quantile = stats.t(n - m).ppf((1 + joint_alpha) / 2)
    a_joint_conf_intervals = np.array(
        [model.a - s_a * joint_quantile, model.a + s_a * joint_quantile])
    print_matrix(
        a_joint_conf_intervals.T,
        var_name="joint confidence intervals for `a` with confidence {}".format(gamma))
    # testing hypotheses: a_i ?= 0
    zero_hypot_statistics = np.abs(model.a) / s_a
    print_matrix(
        zero_hypot_statistics,
        var_name="statistics for hypothesis a_i = 0 (t_alpha={})".format(quantile))
    return (zero_hypot_statistics < quantile).nonzero()[0]
def show_continuous():
    """Show a variety of continuous distributions"""
    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)', '')

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)', '')

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), 'g-.')
    hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     'F-Distribution', 'F', 'P(F)', ['(3,4) DOF', '(10,15) DOF'])

    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(arange(0, 5, 0.02), stats.weibull_min(1), stats.weibull_min(2),
                     'Weibull Distribution', 'X', 'P(X)', ['k=1', 'k=2'],
                     xmin=0, xmax=4)

    # Uniform distribution
    showDistribution(x, stats.uniform, '',
                     'Uniform Distribution', 'X', 'P(X)', '')

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)', ['Normal', 'Logistic'])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)', '', xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x, 2))
    xlim(-10, 4)
    title('Lognormal Distribution')
    xlabel('log(X)')
    ylabel('lognorm(X)')
    show()
def N_eq(N):
    # Distribution for the control group
    Fc = t(df=p * N - 2)
    # Distribution for the treatment group
    Ft = t(df=p * N - 2, loc=MDE)
    # Calculate discrepancy
    delta_N = (MDE - (Fc.ppf(1 - alpha / 2) + Ft.ppf(kappa)) * np.sqrt(
        (p * (1 - p))**(-1) * sigma2 / N))
    # Return discrepancy
    return delta_N
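# Hedged usage sketch for N_eq above (not from the original source): N_eq reads
# the design constants p, MDE, alpha, kappa and sigma2 from the enclosing scope,
# so representative values are assumed here; the required sample size is then the
# root of the discrepancy, found with a standard bracketing solver.
import numpy as np
from scipy.stats import t
from scipy.optimize import brentq

p = 0.5        # treatment share (assumed)
alpha = 0.05   # significance level (assumed)
kappa = 0.8    # target power (assumed)
MDE = 0.1      # minimum detectable effect (assumed)
sigma2 = 1.0   # outcome variance (assumed)

N_required = brentq(N_eq, 10, 1_000_000)  # discrepancy changes sign on this bracket
print(int(np.ceil(N_required)))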
def __init__(self, dofs):
    self.dofs = dofs
    if self.dofs is not None:
        if self.dofs > 0:
            self.bounds = np.array([-np.inf, np.inf])
            mean, var, skew, kurt = t.stats(df=self.dofs, moments='mvsk')
            self.parent = t(df=self.dofs)
            self.mean = mean
            self.variance = var
            self.skewness = skew
            self.kurtosis = kurt
            self.x_range_for_pdf = np.linspace(-5.0, 5.0, RECURRENCE_PDF_SAMPLES)
            self.parent = t(self.dofs)
def _create_probe_statistic(probe_values, fpr):
    # Create prediction interval statistics based on randomly permuted probe features (based on real features)
    n = len(probe_values)
    if n == 1:
        val = probe_values[0]
        low_t = val
        up_t = val
    else:
        probe_values = np.asarray(probe_values)
        mean = probe_values.mean()
        s = probe_values.std()
        low_t = mean + stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
        up_t = mean - stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
    return low_t, up_t
def ttest_uneq(x):
    assert x.columns.get_level_values(0).isin(['mu', 'se', 'n']).all()
    idx = pd.IndexSlice
    mu_mat = x.loc[:, idx['mu']].values
    n_mat = x.loc[:, idx['n']].values
    se_mat = x.loc[:, idx['se']].values
    # Calculate t-stat
    dmu_vec = mu_mat[:, 0] - mu_mat[:, 1]
    var_vec = np.sum(se_mat**2 / n_mat, 1)
    se_vec = np.sqrt(var_vec)
    df_vec = var_vec**2 / np.sum((se_mat**2 / n_mat)**2 / (n_mat - 1), 1)
    stat_vec = dmu_vec / se_vec
    pval = 2 * np.minimum(stats.t(df=df_vec).cdf(stat_vec),
                          1 - stats.t(df=df_vec).cdf(stat_vec))
    res = pd.DataFrame({'stat': stat_vec, 'pval': pval, 'df': df_vec}, index=x.index)
    return res
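# Hedged usage sketch for ttest_uneq above (not from the original source): it
# assumes the function is in scope with numpy, pandas and scipy.stats available,
# and that the input frame has a two-level column index whose first level is
# 'mu' / 'se' / 'n' and whose second level names the two groups being compared.
import numpy as np
import pandas as pd
from scipy import stats

_cols = pd.MultiIndex.from_product([['mu', 'se', 'n'], ['group1', 'group2']])
_summary = pd.DataFrame(
    [[10.0, 11.0, 2.0, 2.5, 30, 35],
     [5.0, 5.2, 1.0, 1.1, 50, 40]],
    columns=_cols, index=['metric_a', 'metric_b'])
print(ttest_uneq(_summary))  # per-row Welch statistic, two-sided p-value and df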
def welchs_ttest(stats1, stats2):
    """
    SNAGGED FROM https://github.com/mozilla/datazilla-metrics/blob/master/dzmetrics/ttest.py#L56

    Execute TWO-sided Welch's t-test given pre-calculated means and stddevs.

    Accepts summary data (N, stddev, and mean) for two datasets and performs
    one-sided Welch's t-test, returning p-value.
    """
    n1 = stats1.count
    m1 = stats1.mean
    v1 = max(stats1.variance, 1.0 / 12.0)

    n2 = stats2.count
    m2 = stats2.mean
    v2 = max(stats2.variance, 1.0 / 12.0)

    if n1 < 2 or n2 < 2:
        return {"confidence": 0, "diff": 0}

    vpooled = v1 / n1 + v2 / n2
    # 1/12 == STD OF STANDARD UNIFORM DISTRIBUTION
    # We assume test replicates (xi) are actually rounded results from
    # actual measurements somewhere in the range of (xi - 0.5, xi + 0.5),
    # which has a variance of 1/12
    tt = abs(m1 - m2) / sqrt(vpooled)

    df_numerator = vpooled ** 2
    df_denominator = ((v1 / n1) ** 2) / (n1 - 1) + ((v2 / n2) ** 2) / (n2 - 1)
    df = df_numerator / df_denominator

    # abs(x - 0.5)*2 IS AN ATTEMPT TO GIVE HIGH NUMBERS TO EITHER TAIL OF THE cdf
    return {"confidence": abs(stats.t(df).cdf(tt) - 0.5) * 2, "diff": tt}
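# Hedged usage sketch for welchs_ttest above (not from the original source): it
# assumes the definition above is in scope and that the two arguments expose
# .count, .mean and .variance attributes, mimicked here with SimpleNamespace.
from math import sqrt
from types import SimpleNamespace
from scipy import stats

_baseline = SimpleNamespace(count=20, mean=101.2, variance=4.5)
_candidate = SimpleNamespace(count=20, mean=103.9, variance=5.1)
print(welchs_ttest(_baseline, _candidate))  # {'confidence': ..., 'diff': ...}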
def density(unifs):
    if np.ndim(unifs[0]) == 0:
        unifs = [[i] for i in unifs]
    v = self.par['v']
    sigma = norm_matrix(self.par['sigma'])
    std = math.sqrt(v / (v - 2))
    vecs = []
    marginal_density = []
    t_obj = stats.t(v)
    for i in range(dim):
        temp = t_obj.ppf(unifs[i])
        marginal_density.append(t_obj.pdf(temp))
        vecs.append(temp)
    res = []
    vecs = list(zip(*vecs))
    marginal_density = list(zip(*marginal_density))
    factor = spe.gamma((v + self.dim) / 2) / (
        spe.gamma(v / 2) * math.sqrt((math.pi * v) ** self.dim * np.linalg.det(sigma)))
    cov_inv = sigma ** (-1)
    for i in list(zip(*[vecs, marginal_density])):
        x = np.matrix(i[0])
        m = np.prod(i[1])
        if m > 0:
            temp = 1 / m * factor * math.exp(
                math.log(1 + x * cov_inv * np.transpose(x) / v) * (-(v + self.dim) / 2))
        else:
            temp = 0
        res.append(temp)
    return res
def mu_ci(self, alpha=0.10, prior=False):
    """
    Compute marginal confidence intervals around each parameter \theta_{ti}
    If prior is False, compute posterior
    """
    _x, _y = np.diag_indices(self.ndim)
    diags = self.mu_scale[:, _x, _y]

    # Only care about marginal scale
    delta = self.state_discount
    if isinstance(delta, np.ndarray):
        delta = np.diag(delta)

    if prior:
        df = self.df[:-1]
        mode = self.mu_mode[:-1]
        scale = np.sqrt(diags[:-1] / delta)
    else:
        df = self.df[1:]
        mode = self.mu_mode[1:]
        scale = np.sqrt(diags[1:])

    q = stats.t(df).ppf(1 - alpha / 2)
    band = (scale.T * q).T
    ci_lower = mode - band
    ci_upper = mode + band

    return mode, ci_lower, ci_upper
def run(self, results_x, results_z, attach=True):
    '''see class docstring (for now)
    '''
    if not np.allclose(results_x.model.endog, results_z.model.endog):
        raise ValueError('endogenous variables in models are not the same')
    nobs = results_x.model.endog.shape[0]
    y = results_x.model.endog
    x = results_x.model.exog
    z = results_z.model.exog
    #sigma2_x = results_x.ssr/nobs
    #sigma2_z = results_z.ssr/nobs
    yhat_x = results_x.fittedvalues
    #yhat_z = results_z.fittedvalues
    res_zx = sm.OLS(y, np.column_stack((yhat_x, z))).fit()
    self.res_zx = res_zx  # for testing
    tstat = res_zx.tvalues[0]
    pval = res_zx.pvalues[0]
    if attach:
        self.res_zx = res_zx
        self.dist = stats.t(res_zx.model.df_resid)
        self.teststat = tstat
        self.pvalue = pval
    return tstat, pval
def linReg(X, y, intercept=False):
    #reweighted least squares logistic regression
    #add intercept:
    if intercept:
        X = np.insert(X, X.shape[1], 1, axis=1)
    y = np.array([y]).T  #make column
    #fit regression:
    betas = np.dot(np.dot(np.linalg.inv((np.dot(X.T, X))), X.T), y)
    #calculate p-values:
    error = y - (np.dot(X, betas))
    RSS = np.sum(error**2)
    betas = betas.flatten()
    df = float((X.shape[0] - (len(betas) - 1 if intercept else 0)) - 1)
    s2 = RSS / df
    #print s2
    beta_ses = np.sqrt(s2 / (np.sum((X - np.mean(X, 0))**2, 0)))
    #print beta_ses
    ts = [betas[j] / beta_ses[j] for j in range(len(betas))]
    pvalues = (1 - ss.t(df).cdf(np.abs(ts))) * 2  #two-tailed
    ##FOR TESTING:
    #print (betas, pvalues)#DEBUG
    #for comparison purposes:
    #results = sm.OLS(y, X).fit() #DEBUG
    #print (results.params, results.pvalues)
    return betas, pvalues
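# Hedged usage sketch for linReg above (not from the original source): it assumes
# the function is in scope with numpy as np and scipy.stats aliased as ss, and
# fits a two-feature design without an intercept so the centered-sum-of-squares
# in the standard-error step stays nonzero.
import numpy as np
import scipy.stats as ss

_rng = np.random.RandomState(1)
_X = _rng.randn(100, 2)
_y = 3.0 * _X[:, 0] + _rng.randn(100) * 0.5
_betas, _pvalues = linReg(_X, _y, intercept=False)
print(_betas, _pvalues)  # first coefficient near 3 with a tiny p-value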
def VaR(ts, alpha, flavour):
    if flavour == "historical":
        temp_ts = ts.copy()
        temp_ts.sort()
        n = len(temp_ts)
        try:
            return -temp_ts.values[np.floor((1 - alpha) * n)]
        except:
            return -temp_ts[np.floor((1 - alpha) * n)]
    elif flavour == "t":
        t = stats.t
        t = stats.t(*t.fit(ts))
        return -t.ppf(1 - alpha)
    elif flavour == "normal":
        mean = ts.mean()
        std = ts.std()
        return -stats.norm.ppf(1 - alpha, mean, std)
    elif flavour == "Cornish-Fischer":
        z_c = -stats.norm.ppf(1 - alpha, 0, 1)
        S = stats.skew(ts)
        K = stats.kurtosis(ts)
        z_cf = z_c + (z_c**2 - 1) * S / 6 + (z_c**3 - 3 * z_c) * K / 24 + (2 * z_c**3 - 5 * z_c) * S**2 / 36
        return ts.mean() - z_cf * np.sqrt(ts.std())
    elif flavour == "kernel":
        kde = stats.gaussian_kde(ts)
        print kde.factor
        f = lambda x: kde.integrate_box_1d(-1, x) - (1 - alpha)
        return -fsolve(f, -0.05)[0]
def student(mm):
    '''Student's t distribution'''
    alfa = 0.05  # significance level
    n = size(mm) - 1  # number of degrees of freedom
    t = stats.t(n)
    tcr = t.ppf(1 - alfa / 2)
    return round(mean(mm), 4), round(tcr * std(mm) / sqrt(size(mm)), 4)
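# Hedged usage sketch for student above (not from the original source): it
# assumes the function is in scope and that size, mean, std and sqrt come from
# numpy while stats is scipy.stats, as the body implies.
from numpy import size, mean, std, sqrt
from scipy import stats

_measurements = [9.9, 10.3, 10.1, 9.8, 10.2]
# returns (sample mean, half-width of the 95% confidence interval)
print(student(_measurements))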
def test_sample_nig():
    mu_0 = 0.0
    lmbda_0 = 10.0
    alpha_0 = 10.0
    beta_0 = 10.0

    # Directly sample nig and look at marginals
    from pyhawkes.utils.utils import sample_nig
    mu_samples = np.array([sample_nig(mu_0, lmbda_0, alpha_0, beta_0)[0]
                           for _ in xrange(10000)])

    # Plot the histogram of impulse means
    plt.figure()
    p_mu = t(df=2 * alpha_0, loc=mu_0,
             scale=np.sqrt(beta_0 / (alpha_0 * lmbda_0)))
    _, bins, _ = plt.hist(mu_samples, bins=50, alpha=0.5, normed=True)
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    plt.plot(bincenters, p_mu.pdf(bincenters), "r--", linewidth=1)
    plt.xlabel("mu")
    plt.ylabel("p(mu)")

    plt.figure()
    probplot(mu_samples, dist=p_mu, plot=plt.gca())
    plt.show()
def plot_mu_density(self, t, index=0, support_thresh=None):
    ix = index
    dists = {}
    weights = {}
    thresh = 0
    for i in range(self.nmodels):
        df = self.df[t]
        mode = self.mu_mode[t + 1, i, ix]
        scale = np.sqrt(self.mu_scale[t + 1, i, ix, ix])
        dist = stats.t(df, loc=mode, scale=scale)
        dists[i] = dist
        weights[i] = self.marginal_prob[t + 1, i]
        thresh = max(thresh, dist.pdf(mode))
    if support_thresh is not None:
        thresh = support_thresh
    else:
        # HACK
        thresh /= 1000
    plot_mixture(dists, weights,
                 hi=self.mu_mode[:, :, ix].max(),
                 lo=self.mu_mode[:, :, ix].min(),
                 support_thresh=thresh)
def plot_mu_density(self, t, index=0, support_thresh=0.1):
    """
    Plot posterior densities for single model parameter over the set of
    mixture components

    Parameters
    ----------
    t : int
        time index, relative to response variable
    index : int
        parameter index to plot

    Notes
    -----
    cf. West & Harrison Figure 12.3. Automatically annotating individual
    component curves would probably be difficult.
    """
    ix = index
    dists = {}
    for name in self.names:
        model = self.models[name]
        df = model.df[t]
        mode = model.mu_mode[t + 1, ix]
        scale = np.sqrt(model.mu_scale[t + 1, ix, ix])
        dists[name] = stats.t(df, loc=mode, scale=scale)

    plot_mixture(dists, self.get_weights(t), support_thresh=support_thresh)
def TProbabilitiesLowerTail(values, df):
    if len(values) > 0 and df > 0:
        outputStr = ""
        areas = []
        for val in values:
            outputStr += str(val)
            rv = stats.t(df, loc=0, scale=1)  # default loc and scale values
            area = rv.cdf(val)
            area = "{0:.5f}".format(area)
            areas.append(area)
            if len(values) > 1 and values.index(val) < len(values) - 1:
                outputStr += ", "
            else:
                outputStr += ""
        outputStr += ", serbestlik derecesi: " + str(df)
        return outputStr, areas
    elif df <= 0:
        return False, "Standart sapma 0'dan kucuk olamaz."
    else:
        return False, "Hesaplama icin gecerli degerler girilmelidir."
def PlotTDistributionDistributionFunction(df):
    if df > 0:
        main_frame = QtGui.QWidget()
        dpi = 100
        fig = Figure((5.0, 4.0), dpi=dpi)
        canvas = FigureCanvas(fig)
        canvas.setParent(main_frame)
        axes = fig.add_subplot(111)
        mpl_toolbar = NavigationToolbar(canvas, main_frame)
        hbox = QtGui.QHBoxLayout()
        vbox = QtGui.QVBoxLayout()
        vbox.addWidget(canvas)
        vbox.addWidget(mpl_toolbar)
        vbox.addLayout(hbox)
        main_frame.setLayout(vbox)
        axes.clear()
        alpha = 0.0005  # this value is used in R; the same value is used here for consistency
        sequance = stats.t.isf(alpha, df)
        x = np.linspace(-sequance, sequance, 100)  # 100 data points by default
        rv = stats.t(df)
        y = rv.cdf(x)
        axes.plot(x, y)
        canvas.draw()
        return main_frame
    else:
        return False, "Serbestlik derecesi 0'dan kucuk olamaz."
def get_normal_gamma_posterior_predictive(x, u0, k0, a0, b0):
    """ Get posterior predictive for vector sample with unknown mean and variance.

    The returned Student-t distribution for the predictive posterior can be seen as derived in [1], [2], and [3]
    1: https://www.cs.ubc.ca/~murphyk/Teaching/CS340-Fall07/reading/NG.pdf (page 5)
    2: https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf (page 9)
    3: https://en.wikipedia.org/wiki/Conjugate_prior (Normal-gamma conjugate prior section)

    :param x: Sample to estimate distribution for (any non-zero length sequence convertible to float)
    :param u0: Hyperparameter for mean of mean distribution
    :param k0: Hyperparameter for inverse variance of mean distribution
    :param a0: Hyperparameter for alpha of precision distribution (shape)
    :param b0: Hyperparameter for beta of precision distribution (rate, not scale)
    :return: T-Distribution (posterior predictive for samples)
        *Note that posterior on parameters is not returned here simply because python has no normal-gamma implementation
    """
    x = np.array(x, dtype=np.float64)
    n = len(x)
    x_bar = np.mean(x)
    u = (k0 * u0 + n * x_bar) / (k0 + n)
    k = k0 + n
    a = a0 + n / 2.
    b = b0 + .5 * np.sum((x - x_bar)**2) + (k0 * n * (x_bar - u0)**2) / (2 * (k0 + n))
    # print(u, k, a, b, (b * (k + 1))/(a * k))
    predictive_dist = stats.t(df=2 * a, loc=u, scale=(b * (k + 1)) / (a * k))
    return predictive_dist
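# Hedged usage sketch for get_normal_gamma_posterior_predictive above (not from
# the original source): it assumes the function is in scope with numpy and
# scipy.stats available, and uses a weak Normal-gamma prior on simulated data.
import numpy as np
from scipy import stats

_rng = np.random.RandomState(0)
_sample = _rng.normal(loc=5.0, scale=2.0, size=50)
_pred = get_normal_gamma_posterior_predictive(_sample, u0=0.0, k0=1.0, a0=1.0, b0=1.0)
# predictive mean and central 95% interval of the returned frozen t distribution
print(_pred.mean(), _pred.interval(0.95))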
def check_mean():
    '''Data from Altman, check for significance of mean value.
    Compare average daily energy intake (kJ) over 10 days of 11 healthy women,
    and compare it to the recommended level of 7725 kJ.
    '''
    # Get data from Altman
    data = getData('altman_91.txt')

    # Watch out: by default the SD is calculated with 1/N!
    myMean = np.mean(data)
    mySD = np.std(data, ddof=1)
    print 'Mean and SD: {0:4.2f} and {1:4.2f}'.format(myMean, mySD)

    # Confidence intervals
    tf = stats.t(len(data) - 1)
    ci = np.mean(data) + stats.sem(data) * np.array([-1, 1]) * tf.isf(0.025)
    print 'The confidence intervals are {0:4.2f} to {1:4.2f}.'.format(ci[0], ci[1])

    # Check for significance
    checkValue = 7725
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print '{0:4.2f} is significantly different from the mean (p={1:5.3f}).'.format(checkValue, prob)

    # For not normally distributed data, use the Wilcoxon signed rank test
    (rank, pVal) = stats.wilcoxon(data - checkValue)
    if pVal < 0.05:
        issignificant = 'unlikely'
    else:
        issignificant = 'likely'
    print 'It is ' + issignificant + ' that the value is {0:d}'.format(checkValue)
def welchs_ttest(stats1, stats2):
    """
    SNAGGED FROM https://github.com/mozilla/datazilla-metrics/blob/master/dzmetrics/ttest.py#L56

    Execute TWO-sided Welch's t-test given pre-calculated means and stddevs.

    Accepts summary data (N, stddev, and mean) for two datasets and performs
    one-sided Welch's t-test, returning p-value.
    """
    n1 = stats1.count
    m1 = stats1.mean
    v1 = stats1.variance

    n2 = stats2.count
    m2 = stats2.mean
    v2 = stats2.variance

    if n1 < 2 or n2 < 2:
        return Struct(confidence=0, diff=0)

    vpooled = v1 / n1 + v2 / n2
    tt = abs(m1 - m2) / sqrt(vpooled)

    df_numerator = vpooled ** 2
    df_denominator = ((v1 / n1) ** 2) / (n1 - 1) + ((v2 / n2) ** 2) / (n2 - 1)
    df = df_numerator / df_denominator

    # RETURN NUMBER-OF-NINES OF CONFIDENCE (0.99 = 2, 0.999 = 3, etc)
    return Struct(confidence=-stats.t(df).logsf(tt) / log(10), diff=tt)
def cdf(
    self,
    resids: ArrayLike,
    parameters: Optional[Union[Sequence[float], ArrayLike1D]] = None,
) -> NDArray:
    parameters = self._check_constraints(parameters)
    scalar = isscalar(resids)
    if scalar:
        resids = array([resids])
    eta, lam = parameters

    a = self.__const_a(parameters)
    b = self.__const_b(parameters)

    var = eta / (eta - 2)
    y1 = (b * resids + a) / (1 - lam) * sqrt(var)
    y2 = (b * resids + a) / (1 + lam) * sqrt(var)
    tcdf = stats.t(eta).cdf
    resids = asarray(resids)

    p = (1 - lam) * tcdf(y1) * (resids < (-a / b))
    p += (resids >= (-a / b)) * ((1 - lam) / 2 + (1 + lam) * (tcdf(y2) - 0.5))
    if scalar:
        p = p[0]
    return p
def predict(self, X, **kwargs):
    mu = self.trace_['alpha'] + self.trace_['beta'] * X[:, None]
    dist = t(df=self.trace_['nu'], loc=mu, scale=self.trace_['sigma'])
    if kwargs.get('q') is None:
        return dist, dist.mean().mean(axis=1)
    else:
        return dist, [dist.ppf(q_).mean(axis=1) for q_ in kwargs['q']]
def oneProportion() -> float:
    """Calculate the confidence intervals of the population, based on a
    given data sample.

    The data are taken from Altman, chapter 10.2.1.
    Suppose a general practitioner chooses a random sample of 215 women from
    the patient register for her general practice, and finds that 39 of them
    have a history of suffering from asthma. What is the confidence interval
    for the prevalence of asthma?

    Returns
    -------
    ci : 95% confidence interval
    """
    # Get the data
    numTotal = 215
    numPositive = 39

    # --- >>> START stats <<< ---
    # Calculate the confidence intervals
    p = float(numPositive) / numTotal
    se = np.sqrt(p * (1 - p) / numTotal)
    td = stats.t(numTotal - 1)
    ci = p + np.array([-1, 1]) * td.isf(0.025) * se
    # --- >>> STOP stats <<< ---

    # Print them
    print('ONE PROPORTION ----------------------------------------')
    print('The confidence interval for the given sample is ' +
          f'{ci[0]:.3f} - {ci[1]:.3f}')

    return ci
def get_p_val(y, z, a, b, muL, muR, var, ind=0, use_tdist=False):
    """
    Correct pval approach using approximations of truncated Gaussians

    Parameters
    ----------
    y: points from one cluster
    z: points from the other cluster
    a: separating hyperplane
    use_tdist: null distribution of TN statistic to use
        False for standard normal
        True for t distribution with df=len(y)+len(z)-2
    muL, muR, var: estimated using maximum likelihood
    ind: gene to test

    Returns
    ----------
    pvalue
    """
    muY, varY, muZ, varZ = get_null_truncmv_params(a, b, muL, muR, var=var, ind=ind)
    nY, nZ = len(y), len(z)
    stat = (np.sum(z[:, ind]) - np.sum(y[:, ind]) - (nZ * muZ - nY * muY)) / np.sqrt(nY * varY + nZ * varZ)
    if use_tdist:
        df = len(z) + len(y) - 2
        d0 = t(df=df).cdf
    else:
        d0 = norm.cdf
    p = np.min((d0(stat), d0(-stat))) * 2
    return p
def oneProportion():
    '''Calculate the confidence intervals of the population, based on a given data sample.
    The data are taken from Altman, chapter 10.2.1.
    Suppose a general practitioner chooses a random sample of 215 women from the
    patient register for her general practice, and finds that 39 of them have a
    history of suffering from asthma. What is the confidence interval for the
    prevalence of asthma?'''

    # Get the data
    numTotal = 215
    numPositive = 39

    # --- >>> START stats <<< ---
    # Calculate the confidence intervals
    p = float(numPositive)/numTotal
    se = np.sqrt(p*(1-p)/numTotal)
    td = stats.t(numTotal-1)
    ci = p + np.array([-1,1])*td.isf(0.025)*se
    # --- >>> STOP stats <<< ---

    # Print them
    print('ONE PROPORTION ----------------------------------------')
    print(('The confidence interval for the given sample is {0:5.3f} to {1:5.3f}'.format(
        ci[0], ci[1])))

    return ci
def get_p_val_1D(y, z, a, muL, muR, var, use_tdist=False):
    """
    Correct pval approach using approximations of truncated Gaussians
    (1D case, so y, z, a should be scalars)

    Parameters
    ----------
    y: points from one cluster
    z: points from the other cluster
    a: threshold
    use_tdist: null distribution of TN statistic to use
        False for standard normal
        True for t distribution with df=len(y)+len(z)-2
    muL, muR, var: estimated using maximum likelihood

    Returns
    ----------
    pvalue
    """
    muY, varY, muZ, varZ = get_null_trunc_params(muL, muR, var=var, a=a)
    nY, nZ = len(y), len(z)
    stat = (np.sum(z) - np.sum(y) - (nZ * muZ - nY * muY)) / np.sqrt(nY * varY + nZ * varZ)
    if use_tdist:
        df = len(z) + len(y) - 2
        d0 = t(df=df).cdf
    else:
        d0 = norm.cdf
    p = np.min((d0(stat), d0(-stat))) * 2
    return p
def sample_ar1t(
    rhos,
    n=50,
    df_t=DEFAULT_DF_T,
):
    """
    Samples t variables according to a Markov chain.
    """
    # Initial t samples
    p = rhos.shape[0] + 1
    tvars = stats.t(df=df_t).rvs(size=(n, p))

    # Initialize X
    X = np.zeros((n, p))
    scale = np.sqrt((df_t - 2) / df_t)
    X[:, 0] = scale * tvars[:, 0]

    # Loop through variables according to markov chain
    conjugates = np.sqrt(1 - rhos**2)
    for j in range(1, p):
        X[:, j] = rhos[j - 1] * X[:, j - 1] + conjugates[j - 1] * scale * tvars[:, j]

    return X
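# Hedged usage sketch for sample_ar1t above (not from the original source): it
# assumes the function is in scope with numpy and scipy.stats available and that
# DEFAULT_DF_T is defined at module level; df_t is passed explicitly here so the
# default value is not needed.
import numpy as np
from scipy import stats

_rhos = np.full(9, 0.5)  # AR(1) correlations between adjacent columns
_X = sample_ar1t(_rhos, n=500, df_t=8)
print(_X.shape)                               # (500, 10)
print(np.corrcoef(_X[:, 0], _X[:, 1])[0, 1])  # roughly 0.5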
def fun7():
    print("Open the three major sampling distributions")
    # plot the normal, chi-square, t and F distributions
    nor_dis = stats.norm()
    chi2_dis = stats.chi2(df=eval(k_1.get()))
    t_dis = stats.t(df=eval(t_1.get()))
    f_dis = stats.f(dfn=eval(f_1.get()), dfd=eval(f_2.get()))
    x1 = np.linspace(nor_dis.ppf(0.001), nor_dis.ppf(0.999), 1000)
    x2 = np.linspace(chi2_dis.ppf(0.001), chi2_dis.ppf(0.999), 1000)
    x3 = np.linspace(t_dis.ppf(0.001), t_dis.ppf(0.999), 1000)
    x4 = np.linspace(f_dis.ppf(0.001), f_dis.ppf(0.999), 1000)
    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    ax.plot(x1, nor_dis.pdf(x1), 'r-', lw=2, label=r'N(0, 1)')
    ax.plot(x2, chi2_dis.pdf(x2), 'g-', lw=2, label=r'$\chi^2$(%d)' % eval(k_1.get()))
    ax.plot(x3, t_dis.pdf(x3), 'b-', lw=2, label='t(%d)' % eval(t_1.get()))
    ax.plot(x4, f_dis.pdf(x4), 'm-', lw=2, label='F(%d, %d)' % (eval(f_1.get()), eval(f_2.get())))
    plt.xlabel("x")
    plt.ylabel('Probability')
    plt.title(r'PDF of Three Sampling Distribution')
    ax.legend(loc='best', frameon=False)
    plt.grid()
    plt.show()
def TQuantilesLowerTail(probs, df):
    if len(probs) > 0 and df > 0:
        outputStr = ""
        yArray = []
        for prob in probs:
            outputStr += str(prob)
            if prob > 0 and prob < 1:
                rv = stats.t(df, loc=0, scale=1)
                y = rv.ppf(prob)
                y = "{0:.5f}".format(y)
                yArray.append(y)
            else:
                yArray.append("NaN")
            if len(probs) > 1 and probs.index(prob) < len(probs) - 1:
                outputStr += ", "
            else:
                outputStr += ""
        outputStr += ", serbestlik derecesi: " + str(df)
        return outputStr, yArray
    elif df <= 0:
        return False, "Serbestlik derecesi 0'dan kucuk olamaz."
    else:
        return False, "Gecerli olasilik degeri girilmelidir."
def coherr(C, J1, J2, p=0.05, Nsp1=None, Nsp2=None):
    """
    Function to compute lower and upper confidence intervals on
    coherency (absolute value of coherence).

    C:      coherence (real or complex)
    J1,J2:  tapered fourier transforms
    p:      the target P value (default 0.05)
    Nsp1:   number of spikes in J1, used for finite size correction.
    Nsp2:   number of spikes in J2, used for finite size correction.
            Default is None, for no correction

    Outputs:
    CI:       confidence interval for C, N x 2 array, (lower, upper)
    phi_std:  standard deviation of phi, N array
    """
    from numpy import iscomplexobj, absolute, fix, zeros, setdiff1d, real, sqrt,\
        arctanh, tanh
    from scipy.stats import t

    J1 = _combine_trials(J1)
    J2 = _combine_trials(J2)
    N, K = J1.shape
    assert J1.shape == J2.shape, "J1 and J2 must have the same dimensions."
    assert N == C.size, "S and J lengths don't match"
    if iscomplexobj(C):
        C = absolute(C)

    pp = 1 - p / 2
    dof = 2 * K
    dof1 = dof if Nsp1 is None else fix(2. * Nsp1 * dof / (2. * Nsp1 + dof))
    dof2 = dof if Nsp2 is None else fix(2. * Nsp2 * dof / (2. * Nsp2 + dof))
    dof = min(dof1, dof2)

    Cerr = zeros((N, 2))
    tcrit = t(dof - 1).ppf(pp).tolist()
    atanhCxyk = zeros((N, K))
    phasefactorxyk = zeros((N, K), dtype='complex128')
    for k in xrange(K):
        indxk = setdiff1d(range(K), [k])
        J1k = J1[:, indxk]
        J2k = J2[:, indxk]
        eJ1k = real(J1k * J1k.conj()).sum(1)
        eJ2k = real(J2k * J2k.conj()).sum(1)
        eJ12k = (J1k.conj() * J2k).sum(1)
        Cxyk = eJ12k / sqrt(eJ1k * eJ2k)
        absCxyk = absolute(Cxyk)
        atanhCxyk[:, k] = sqrt(2 * K - 2) * arctanh(absCxyk)
        phasefactorxyk[:, k] = Cxyk / absCxyk

    atanhC = sqrt(2 * K - 2) * arctanh(C)
    sigma12 = sqrt(K - 1) * atanhCxyk.std(1)

    Cu = atanhC + tcrit * sigma12
    Cl = atanhC - tcrit * sigma12
    Cerr[:, 0] = tanh(Cl / sqrt(2 * K - 2))
    Cerr[:, 1] = tanh(Cu / sqrt(2 * K - 2))
    phistd = (2 * K - 2) * (1 - absolute(phasefactorxyk.mean(1)))
    return Cerr, phistd
def __init__(self, a, b, beginning=None, ending=None, beginning_factor=None, ending_factor=None):
    """
    start and end can be in either datetime or unix time
    """
    a, b = UnixTime(a), UnixTime(b)
    assert a < b, "'b' should be greater than 'a'"
    if (beginning, ending) != (None, None):
        assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                  "either 'beginning_factor' and 'ending_factor' " \
                                                                  "or 'beginning' and 'ending'"
        if not a < beginning and ending < b and (beginning < ending or almost_equals(beginning, ending)):
            raise AttributeError("The inputs should satisfy 'a < beginning <= ending < b' relation")
    if beginning_factor is not None:
        assert beginning_factor > 0
        self.beginning_factor = beginning_factor
    if ending_factor is not None:
        assert ending_factor > 0
        self.ending_factor = ending_factor
    if (beginning, ending) == (None, None):
        beginning, ending = 0, 0
        while not a < beginning < ending < b:
            beginning = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    a + float(b - a) / self.beginning_factor,
                    float(b - a) / self.beginning_factor
                )
            )
            ending = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    b - float(b - a) / self.ending_factor,
                    float(b - a) / self.ending_factor
                )
            )
    TemporalEvent.__init__(self,
                           uniform(loc=a, scale=UnixTime(beginning - a)),
                           uniform(loc=ending, scale=UnixTime(b - ending)),
                           bins=4)
def calculate_mean_confidence_interval_small(series, confidence_interval=0.95):
    mean = series.mean()
    s = math.sqrt(series.var())
    count = series.count()
    rv = t(count - 1)
    z = rv.isf((1 - confidence_interval) / 2)
    delta = round(z * (s / math.sqrt(count)), 2)
    return FloatInterval.closed(mean - delta, mean + delta)
def umean(bb, stdlev=0.05):
    '''calculates correct uncertainty using student coefficient
    '''
    from uncertainties import ufloat
    from numpy import mean, std
    from math import sqrt
    from scipy import stats
    return ufloat(mean(bb), std(bb) * stats.t(len(bb)).isf(stdlev) / sqrt(len(bb) - 1))
def PosteriorParameters(Y):
    stdY = np.std(Y)
    meanY = np.mean(Y)
    rv = stats.t(Y.size - 1, loc=meanY, scale=stdY**2 / Y.size)
    #sigma_posterior=((Y.size-1)*stdY**2)/stats.chi2.pdf(Y[:-1],stdY**2)
    mu_posterior = stats.t.pdf(Y, Y.size - 1)
    return (mu_posterior)
def __init__(self, a, b, beginning=None, ending=None, beginning_factor=None, ending_factor=None):
    """
    start and end can be in either datetime or unix time
    """
    a, b = UnixTime(a), UnixTime(b)
    assert a < b, "'b' should be greater than 'a'"
    if (beginning, ending) != (None, None):
        assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                  "either 'beginning_factor' and 'ending_factor' " \
                                                                  "or 'beginning' and 'ending'"
    if beginning_factor is not None:
        assert beginning_factor > 0
        self.beginning_factor = beginning_factor
    if ending_factor is not None:
        assert ending_factor > 0
        self.ending_factor = ending_factor
    if (beginning, ending) != (None, None):
        beginning = UnixTime(beginning)
        ending = UnixTime(ending)
    else:
        beginning, ending = 0, 0
        while not a < beginning < ending < b:
            beginning = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    a + float(b - a) / self.beginning_factor,
                    float(b - a) / self.beginning_factor
                )
            )
            ending = random_time(
                a, b,
                probability_distribution=t(
                    # df, mean, variance
                    4,
                    b - float(b - a) / self.ending_factor,
                    float(b - a) / self.ending_factor
                )
            )
    TemporalEventPiecewiseLinear.__init__(self, [a, beginning, ending, b], [0, 1, 1, 0])
def show_continuous():
    """Show a variety of continuous distributions"""
    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)', '')

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)', '')

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), 'g')
    hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     'F-Distribution', 'F', 'P(F)', ['(3,4) DOF', '(10,15) DOF'])

    # Uniform distribution
    showDistribution(x, stats.uniform, '',
                     'Uniform Distribution', 'X', 'P(X)', '')

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)', ['Normal', 'Logistic'])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)', '', xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x, 2))
    xlim(-10, 4)
    title('Lognormal Distribution')
    xlabel('log(X)')
    ylabel('lognorm(X)')
    show()
def infer_1dgaussian(self, init_labels, output_file=None):
    """Perform inference on class labels assuming data are 1-d gaussian,
    without OpenCL acceleration.
    """
    total_time = 0
    a_time = time()
    cluster_labels = init_labels
    if self.record_best:
        self.auto_save_sample(cluster_labels)
    for i in xrange(self.niter):
        # identify existing clusters and generate a new one
        uniq_labels = np.unique(cluster_labels)
        _, _, new_cluster_label = smallest_unused_label(uniq_labels)
        uniq_labels = np.hstack((new_cluster_label, uniq_labels))

        # compute the sufficient statistics of each cluster
        logpost = np.empty((self.N, uniq_labels.shape[0]))
        for label_index in xrange(uniq_labels.shape[0]):
            label = uniq_labels[label_index]
            if label == new_cluster_label:
                n, mu, var = 0, 0, 0
            else:
                cluster_obs = self.obs[np.where(cluster_labels == label)]
                n = cluster_obs.shape[0]
                mu = np.mean(cluster_obs)
                var = np.var(cluster_obs)
            k_n = self.gaussian_k0 + n
            mu_n = (self.gaussian_k0 * self.gaussian_mu0 + n * mu) / k_n
            alpha_n = self.gamma_alpha0 + n / 2
            beta_n = self.gamma_beta0 + 0.5 * var * n + \
                self.gaussian_k0 * n * (mu - self.gaussian_mu0) ** 2 / (2 * k_n)
            Lambda = alpha_n * k_n / (beta_n * (k_n + 1))
            t_frozen = t(df=2 * alpha_n, loc=mu_n, scale=(1 / Lambda) ** 0.5)
            logpost[:, label_index] = t_frozen.logpdf(self.obs[:, 0])
            logpost[:, label_index] += np.log(n / (self.N + self.alpha)) if n > 0 \
                else np.log(self.alpha / (self.N + self.alpha))

        # sample and implement the changes
        temp_cluster_labels = np.empty(cluster_labels.shape, dtype=np.int32)
        for j in xrange(self.N):
            target_cluster = sample(a=uniq_labels, p=lognormalize(logpost[j]))
            temp_cluster_labels[j] = target_cluster

        if self.record_best:
            if self.auto_save_sample(temp_cluster_labels):
                cluster_labels = temp_cluster_labels
            if self.no_improvement(500):
                break
        else:
            if i >= self.burnin and i % self.thining == 0:
                print(*temp_cluster_labels, file=output_file, sep=',')

    self.total_time += time() - a_time
    return self.gpu_time, self.total_time, Counter(cluster_labels).most_common()
def confidence_interval_mean(std, num, confidence=0.95):
    """ calculates the confidence interval of the mean given a standard
    deviation and a number of observations, assuming a normal distribution """
    sem = std / np.sqrt(num)  # estimate of the standard error of the mean

    # get confidence interval from student-t distribution
    factor = stats.t(num - 1).ppf(0.5 + 0.5 * confidence)
    return factor * sem
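# Hedged usage sketch for confidence_interval_mean above (not from the original
# source): it assumes the function is in scope with numpy and scipy.stats
# available; the returned value is the half-width of the interval, to be added
# to and subtracted from the sample mean.
import numpy as np
from scipy import stats

_data = np.array([9.8, 10.1, 10.4, 9.9, 10.2, 10.0])
_half_width = confidence_interval_mean(_data.std(ddof=1), len(_data), confidence=0.95)
print(_data.mean() - _half_width, _data.mean() + _half_width)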
def T(v, tag=None):
    """
    A Student-T random variate

    Parameters
    ----------
    v : int
        The degrees of freedom of the distribution (must be greater than one)
    """
    assert isinstance(v, int) and v > 1, 'v must be an int greater than 1'
    return uv(rv=ss.t(v), tag=tag)