def __init__(self, data, **kwargs): r"""Constructor. This will fit both chi2 function in the different regimes. *data* - Data sample to use for fitting Keyword Argument: *chi1/2* - Keyword arguments like floc, fshape, etc. that are passed to the constructor of the corresponding chi2 scipy object. """ data = np.asarray(data) c1 = kwargs.pop("chi1", dict()) c2 = kwargs.pop("chi2", dict()) self.par1 = chi2.fit(data[data > 0.], **c1) self.par2 = chi2.fit(-data[data < 0.], **c2) self.f1 = chi2(*self.par1) self.f2 = chi2(*self.par2) self.eta = float(np.count_nonzero(data > 0.)) / len(data) self.eta_err = np.sqrt(self.eta * (1. - self.eta) / len(data)) # get fit-quality self.ks1 = kstest(data[data > 0.], "chi2", args=self.par1)[1] self.ks2 = kstest(-data[data < 0.], "chi2", args=self.par2)[1] return
def __init__(self, data, **kwargs): r"""Constructor, evaluates the percentage of events equal to zero and fits a chi2 to the rest of the data. Parameters ----------- data : array Data values to be fit """ data = np.asarray(data) if len(data) == 2: self.eta = data[0] self.par = [data[1], 0., 1.] self.eta_err = np.nan self.ks = np.nan self.f = chi2(*self.par) return self.par = chi2.fit(data[data > 0], **kwargs) self.f = chi2(*self.par) self.eta = float(np.count_nonzero(data > 0)) / len(data) self.eta_err = np.sqrt(self.eta * (1. - self.eta) / len(data)) self.ks = kstest(data[data > 0], "chi2", args=self.par)[0] return
def T1_test(sample_cov, true_cov, n):
    """
    Test the hypothesis that a sample covariance matrix comes from a
    multivariate normal distribution whose true covariance is given.

    sample_cov: sample covariance matrix
    true_cov: known covariance matrix
    n: number of observations per variable

    Returns the probability of obtaining a covariance matrix like this
    if the distribution were multivariate normal. Based on Nagao 1973;
    this is accurate only for n large (and larger than the size of the
    matrix).

    By Anne M. Archibald 2007
    """
    from numpy import dot, shape, trace, eye
    from scipy.linalg import inv
    from scipy.stats import chi2

    p, r = shape(sample_cov)
    if p != r or (p, r) != shape(true_cov):
        raise ValueError("Sample covariance matrix (%d by %d) and true "
                         "covariance matrix (%d by %d) must be square "
                         "matrices of the same size"
                         % (p, r, shape(true_cov)[0], shape(true_cov)[1]))
    if p > n:
        raise ValueError("This statistic is not correct for matrices "
                         "with n smaller than the matrix size")

    M = dot(sample_cov, inv(true_cov)) - eye(p)
    T1 = (n - 1) / 2 * trace(dot(M, M))
    f = p * (p + 1) / 2
    return (chi2(f).sf(T1)
            - (1. / (n - 1)) * (p / 12. * (4 * p**2 + 9 * p + 7) * chi2(f + 6).cdf(T1)
                                - p / 8. * (6 * p**2 + 13 * p + 8) * chi2(f + 4).cdf(T1)
                                + p / 2. * (p + 1)**2 * chi2(f + 2).cdf(T1)
                                - p / 24. * (2 * p**2 + 3 * p - 1) * chi2(f).cdf(T1)))
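# A minimal usage sketch for T1_test (assuming only numpy/scipy): draw n
# observations from a known multivariate normal, form the sample covariance,
# and check that the returned probability is typically large when the
# hypothesised covariance is the true one. The values below are made up.
import numpy as np
from scipy.stats import multivariate_normal

true_cov = np.array([[2.0, 0.3],
                     [0.3, 1.0]])
n = 500
X = multivariate_normal(mean=[0.0, 0.0], cov=true_cov).rvs(size=n, random_state=0)
sample_cov = np.cov(X, rowvar=False)
print(T1_test(sample_cov, true_cov, n))  # usually well above 0.05 here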
def __setstate__(self, state):
    for key, val in state.items():
        setattr(self, key, val)
    self.f1 = chi2(*self.par1)
    self.f2 = chi2(*self.par2)
    return
def test_1D_is_chisquared(self):
    # The 1-dimensional Wishart with an identity scale matrix is just a
    # chi-squared distribution.
    # Test variance, mean, entropy, pdf
    # Kolmogorov-Smirnov test for rvs
    np.random.seed(482974)

    sn = 500
    dim = 1
    scale = np.eye(dim)

    df_range = np.arange(1, 10, 2, dtype=float)
    X = np.linspace(0.1, 10, num=10)
    for df in df_range:
        w = wishart(df, scale)
        c = chi2(df)

        # Statistics
        assert_allclose(w.var(), c.var())
        assert_allclose(w.mean(), c.mean())
        assert_allclose(w.entropy(), c.entropy())

        # PDF
        assert_allclose(w.pdf(X), c.pdf(X))

        # rvs
        rvs = w.rvs(size=sn)
        args = (df,)
        alpha = 0.01
        check_distribution_rvs('chi2', args, alpha, rvs)
def test_is_scaled_chisquared(self):
    # The 2-dimensional Wishart with an arbitrary scale matrix can be
    # transformed to a scaled chi-squared distribution.
    # For :math:`S \sim W_p(V, n)` and :math:`\lambda \in \mathbb{R}^p` we have
    # :math:`\lambda' S \lambda \sim \lambda' V \lambda \times \chi^2(n)`
    np.random.seed(482974)

    sn = 500
    df = 10
    dim = 4
    # Construct an arbitrary positive definite matrix
    scale = np.diag(np.arange(4) + 1)
    scale[np.tril_indices(4, k=-1)] = np.arange(6)
    scale = np.dot(scale.T, scale)
    # Use :math:`\lambda = [1, \dots, 1]'`
    lamda = np.ones((dim, 1))
    sigma_lamda = lamda.T.dot(scale).dot(lamda).squeeze()
    w = wishart(df, sigma_lamda)
    c = chi2(df, scale=sigma_lamda)

    # Statistics
    assert_allclose(w.var(), c.var())
    assert_allclose(w.mean(), c.mean())
    assert_allclose(w.entropy(), c.entropy())

    # PDF
    X = np.linspace(0.1, 10, num=10)
    assert_allclose(w.pdf(X), c.pdf(X))

    # rvs
    rvs = w.rvs(size=sn)
    args = (df, 0, sigma_lamda)
    alpha = 0.01
    check_distribution_rvs('chi2', args, alpha, rvs)
def correct_covariance(self, data):
    """Apply a correction to raw Minimum Covariance Determinant estimates.

    Correction using the empirical correction factor suggested
    by Rousseeuw and Van Driessen in [Rouseeuw1984]_.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        The data matrix, with p features and n samples.
        The data set must be the one which was used to compute
        the raw estimates.

    Returns
    -------
    covariance_corrected : array-like, shape (n_features, n_features)
        Corrected robust covariance estimate.
    """
    X_centered = data - self.raw_location_
    dist = np.sum(
        np.dot(X_centered, linalg.pinv(self.raw_covariance_)) * X_centered,
        1)
    correction = np.median(dist) / chi2(data.shape[1]).isf(0.5)
    covariance_corrected = self.raw_covariance_ * correction
    self._set_estimates(covariance_corrected)
    return covariance_corrected
def jtest(self, theta, **kwargs):
    """J-test for misspecification of the model.

    Tests whether all intercepts (alphas) are simultaneously zero.

    Parameters
    ----------
    theta : (dim_k*(dim_n+1)-1, ) array
        Parameter vector

    Returns
    -------
    jstat : float
        J-statistic
    jpval : float
        Corresponding p-value of the test, in percent
    """
    dim_n, dim_k = self.__get_dimensions()[1:]
    param_var = self.compute_theta_var(theta, **kwargs)
    alpha_var = param_var[0:dim_n*dim_k:dim_k, 0:dim_n*dim_k:dim_k]
    # shift the spectrum if the alpha covariance is not positive definite
    eig = np.linalg.eigvalsh(alpha_var).min()
    if eig <= 0:
        alpha_var -= np.eye(dim_n) * eig * 1.1
    inv_var = np.linalg.pinv(alpha_var)
    try:
        np.linalg.cholesky(inv_var)
    except np.linalg.LinAlgError:
        warnings.warn('Inverse of alpha variance is not P.D.!')
    alpha = self.convert_theta_to2d(theta)[0]
    jstat = (alpha.dot(inv_var) * alpha).sum()
    jpval = 1 - chi2(dim_n).cdf(jstat)
    return jstat, jpval * 100
def calculate_Var_confidence_interval_large(series, confidence_interval=0.95):
    count = series.count()
    var = series.var()
    upper = (count - 1) * var
    rv = chi2(count - 1)
    alpha = 1 - confidence_interval
    return FloatInterval.closed(
        round(upper / rv.isf(alpha / 2), 2),
        round(upper / rv.isf(1 - alpha / 2), 2))
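# A usage sketch for the interval above, assuming pandas and a FloatInterval
# class as provided by the `intervals` package; the sample itself is made up.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
s = pd.Series(rng.normal(loc=0.0, scale=2.0, size=200))  # true variance = 4
print(calculate_Var_confidence_interval_large(s, confidence_interval=0.95))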
def PlotChi2DistributionDistributionFunction(df):
    if df > 0:
        main_frame = QtGui.QWidget()
        dpi = 100
        fig = Figure((5.0, 4.0), dpi=dpi)
        canvas = FigureCanvas(fig)
        canvas.setParent(main_frame)
        axes = fig.add_subplot(111)
        mpl_toolbar = NavigationToolbar(canvas, main_frame)
        hbox = QtGui.QHBoxLayout()
        vbox = QtGui.QVBoxLayout()
        vbox.addWidget(canvas)
        vbox.addWidget(mpl_toolbar)
        vbox.addLayout(hbox)
        main_frame.setLayout(vbox)

        alpha = 0.0005
        sequence = stats.chi2.isf(alpha, df)
        x = np.linspace(-sequence, sequence, 1000)
        rv = stats.chi2(df)
        y = rv.cdf(x)
        axes.plot(x, y)
        canvas.draw()
        return main_frame
    else:
        return False, "The degrees of freedom cannot be less than 0."
def plot_gmm_confidence_ellipses(ax, means, covariances, colors,
                                 confidence=0.95, plot_eigenvectors=True):
    """Plots ellipses for gmm covariances.

    :param ax:
    :param means: (n_components, n_features) means.
    :param covariances: (n_components, n_features, n_features) covariances.
    :param colors: ellipse colors.
    :param confidence:
    :param plot_eigenvectors:
    :return:
    """
    n_components, n_features = means.shape
    alpha = np.sqrt(chi2(n_features).ppf(confidence))
    for k in range(n_components):
        # plot ellipse from covariance
        values, vectors = _eig_sort(covariances[k])
        w, h = 2 * alpha * np.sqrt(values)
        angle = np.degrees(np.arctan2(vectors[1, 0], vectors[0, 0]))
        ax.add_artist(
            Ellipse(means[k], w, h, angle, color=colors[k], fill=False))

        # plot eigenvectors if needed
        if plot_eigenvectors:
            arrow_params = {'color': colors[k], 'length_includes_head': True,
                            'head_width': 0.05, 'head_length': 0.1}
            ax.arrow(*means[k], *(vectors[:, 0] * w / 2), **arrow_params)
            ax.arrow(*means[k], *(vectors[:, 1] * h / 2), **arrow_params)
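# The key scaling step above is the chi-square quantile: the Mahalanobis
# radius of an n_features-dimensional Gaussian confidence region is
# sqrt(chi2(n_features).ppf(confidence)). A standalone check of that constant
# (scipy only):
from scipy.stats import chi2

radius = chi2(2).ppf(0.95) ** 0.5
print(radius)  # ~2.448: the 95% ellipse of a 2-D Gaussian is a ~2.45-sigma contour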
def get_stats(P):
    pdfPk = []
    for i in range(P.shape[1]):
        N = float(P.shape[0])
        var = np.sum(P[:, i]) / (N * (4. - 2.))
        pdfPk.append(chi2(4., scale=var))
    return pdfPk
def distModelIndexChanged_hndlr(self):
    '''
    handler for changing the item in the combobox under the probability plot
    :return:
    '''
    index = self.distModelBox.currentIndex()
    if index == 0:
        self.probModel = stats.norm
        self.ddofEdit.setDisabled(True)
    elif index == 1:
        self.probModel = stats.expon
        self.ddofEdit.setDisabled(True)
    elif index == 2:
        self.probModel = stats.laplace
        self.ddofEdit.setDisabled(True)
    elif index == 3:
        try:
            self.df = np.float64(self.ddofEdit.text())
        except ValueError:
            self.ddofEdit.setText(str(self.df))
        self.probModel = stats.chi2(self.df)
        self.ddoflabel.setText(_('ProboPlot', 'Number of degrees of freedom'))
        self.ddofEdit.setEnabled(True)
    elif index == 4:
        try:
            self.df = np.float64(self.ddofEdit.text())
        except ValueError:
            self.ddofEdit.setText(str(self.df))
        self.probModel = stats.exponweib(a=1, c=self.df)
        self.ddoflabel.setText(_('ProboPlot', 'Shape of distribution'))
        self.ddofEdit.setEnabled(True)
    try:
        self.drawProbPlot(self.currDist)
    except Exception:
        return
def correct_covariance(self, data):
    """Apply a correction to raw Minimum Covariance Determinant estimates.

    Correction using the empirical correction factor suggested
    by Rousseeuw and Van Driessen in [RVD]_.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        The data matrix, with p features and n samples.
        The data set must be the one which was used to compute
        the raw estimates.

    References
    ----------
    .. [RVD] `A Fast Algorithm for the Minimum Covariance Determinant
        Estimator, 1999, American Statistical Association and the American
        Society for Quality, TECHNOMETRICS`

    Returns
    -------
    covariance_corrected : array-like, shape (n_features, n_features)
        Corrected robust covariance estimate.
    """
    correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)
    covariance_corrected = self.raw_covariance_ * correction
    self.dist_ /= correction
    return covariance_corrected
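# Both correct_covariance variants above rely on the same consistency
# argument: for Gaussian data, squared Mahalanobis distances follow chi2(p),
# so their median should equal chi2(p).isf(0.5) (the chi-square median); the
# ratio of the observed median to that value rescales the raw MCD covariance.
# A minimal standalone sketch of the factor, with `dist` a hypothetical array
# of squared Mahalanobis distances:
import numpy as np
from scipy.stats import chi2

def mcd_correction_factor(dist, n_features):
    # ratio of observed median squared distance to its Gaussian expectation
    return np.median(dist) / chi2(n_features).isf(0.5)

print(chi2(3).isf(0.5))  # chi-square median for p = 3, ~2.366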
def profile_likelihood_crit(profile_likelihood, max_likelihood,
                            clevels=[0.674, 0.95, 0.997], log=True):
    """
    Return the critical values of the profile likelihood that correspond to
    the given confidence levels (based on the likelihood ratio test).
    Useful for the calculation of confidence intervals.

    Parameters
    ----------
    profile_likelihood : n-d array_like
        the profile (log)likelihood
    max_likelihood : float
        maximized value of the (log)likelihood
    clevels : list
        confidence levels
    log : bool
        must be True if log-likelihoods are provided
    """
    df = profile_likelihood.ndim
    lambda_crit = np.array([stats.chi2(df).ppf(cl) for cl in clevels])
    ploglike_crit = (2. * max_likelihood - lambda_crit) / 2.
    if log:
        return ploglike_crit
    else:
        return np.exp(ploglike_crit)
def error_string1(df, chi2):
    rv = stats.chi2(df)
    p_val = 1 - rv.cdf(chi2)
    ans = '(degrees of freedom) df = %s and the p-value = %.3g' \
        % (df, p_val)
    return ans
def __setstate__(self, state):
    for key, val in state.items():
        setattr(self, key, val)
    self.f = chi2(*self.par)
    return
def Chi2ProbabilitiesLowerTail(values, df):
    if len(values) > 0 and df > 0:
        outputStr = ""
        areas = []
        for val in values:
            outputStr += str(val)
            rv = stats.chi2(df, loc=0, scale=1)
            area = rv.cdf(val)
            area = "{0:.5f}".format(area)
            areas.append(area)
            if len(values) > 1 and values.index(val) < len(values) - 1:
                outputStr += ", "
            else:
                outputStr += ""
        outputStr += ", degrees of freedom: " + str(df)
        return outputStr, areas
    elif df <= 0:
        return False, "The degrees of freedom cannot be less than 0."
    else:
        return False, "A valid probability value must be entered."
def Chi2QuantilesLowerTail(probs, df):
    if len(probs) > 0 and df > 0:
        outputStr = ""
        yArray = []
        for prob in probs:
            outputStr += str(prob)
            if prob > 0 and prob < 1:
                rv = stats.chi2(df, loc=0, scale=1)
                y = rv.ppf(prob)
                y = "{0:.5f}".format(y)
                yArray.append(y)
            else:
                yArray.append("NaN")
            if len(probs) > 1 and probs.index(prob) < len(probs) - 1:
                outputStr += ", "
            else:
                outputStr += ""
        outputStr += ", degrees of freedom: " + str(df)
        return outputStr, yArray
    elif df <= 0:
        return False, "The degrees of freedom cannot be less than 0."
    else:
        return False, "A valid probability value must be entered."
def mcnemar_test(test_1, test_2, significance=0.01):
    """
    Perform McNemar's statistical test.

    Parameters
    ----------
    test_1 : numpy array
        Test 1 sample(s).
    test_2 : numpy array
        Test 2 sample(s).
    significance : float, optional
        Significance level.

    Returns
    -------
    significance : int
        Significance {-1, 0, +1}.
    p_value : float
        P-value.

    Notes
    -----
    Please see: http://en.wikipedia.org/wiki/McNemar%27s_test

    +-----------------+-----------------+-----------------+-----------+
    |                 | Test 2 positive | Test 2 negative | Row total |
    +-----------------+-----------------+-----------------+-----------+
    | Test 1 positive |        a        |        b        |   a + b   |
    | Test 1 negative |        c        |        d        |   c + d   |
    +-----------------+-----------------+-----------------+-----------+
    | Column total    |      a + c      |      b + d      |     n     |
    +-----------------+-----------------+-----------------+-----------+
    """
    from scipy.stats import chi2

    # convert the tests to numpy arrays
    test_1 = np.asarray(test_1)
    test_2 = np.asarray(test_2)
    # both tests must have the same length
    if not (test_1.size == test_2.size and test_1.shape == test_2.shape):
        raise ValueError("Both tests must have the same size and shape.")
    # calculate b and c (the discordant pairs); a and d are not needed
    # a = np.sum(test_1 * test_2)
    b = np.sum(test_1 > test_2)
    c = np.sum(test_1 < test_2)
    # d = np.sum(-test_1 * -test_2)
    # is the chi-squared approximation ok?
    if b + c < 25:
        raise NotImplementedError("implement correct binomial distribution or "
                                  "use bigger sample sizes (b + c >= 25)")
    # statistical test
    stat = (b - c) ** 2 / float(b + c)
    # test under chi square distribution
    p = chi2(1).sf(stat)
    # direction of significance
    sig = 0
    if p < significance:
        sig = 1 if b > c else -1
    return sig, p
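# A usage sketch for mcnemar_test with made-up binary outcomes; the arrays
# are constructed so the discordant count b + c = 30 passes the sample-size
# guard (b = 20, c = 10).
import numpy as np

test_1 = np.array([1] * 30 + [0] * 30)
test_2 = np.array([0] * 20 + [1] * 20 + [0] * 20)
sig, p = mcnemar_test(test_1, test_2, significance=0.01)
print(sig, p)  # stat = (20-10)**2/30 ~ 3.33, p ~ 0.068, so sig = 0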
def setup_class(cls):
    cls.rng = RandomState(23456)
    fixed_rng = stats.chi2(10)
    cls.t = t = 1000
    cls.k = k = 50
    cls.losses = fixed_rng.rvs((t, k))
    index = pd.date_range('2000-01-01', periods=t)
    cls.losses_df = pd.DataFrame(cls.losses, index=index)
def star_optimize_alpha_threshold(self):
    alpha = self.doubleSpinBox_optimize_alpha.value()
    apsize = float(self.comboBox_apsize.currentText())
    nobs = np.count_nonzero(self.p.aperture[apsize].frames_mask)
    chi2dist = chi2(nobs - 2)
    chi2limits = np.divide(chi2dist.interval(alpha), nobs - 2)
    self.doubleSpinBox_optimize_lower.setValue(chi2limits[0])
    self.doubleSpinBox_optimize_upper.setValue(chi2limits[1])
def find_optimal_T_chi2(bg_rate, m, P):
    """Returns the min. T so that bg_rate is <= lower_conf_rate(m, T, P).

    This is equivalent to, but much faster than, find_optimal_T_iter().
    Note: this is based on the chi-square confidence interval for the rate
    of exponentially distributed inter-arrival times (m counts observed).
    """
    T = 0.5 * chi2(2 * m).ppf(P) / bg_rate
    return T
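# find_optimal_T_chi2 inverts the standard chi-square confidence bound for an
# exponential/Poisson rate: with m counts in time T, 0.5*chi2(2m).ppf(P)/T is
# a lower confidence bound on the rate at level P. A quick sanity check with
# hypothetical numbers (scipy only):
from scipy.stats import chi2

bg_rate, m, P = 1000.0, 10, 0.99
T = 0.5 * chi2(2 * m).ppf(P) / bg_rate
print(T, 0.5 * chi2(2 * m).ppf(P) / T)  # the bound at this T equals bg_rate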
def get_mswd_limits(n, k=1):
    dof = n - k
    # calculate the reduced chi2 95% interval for the given dof
    # use the scale parameter to get chi2_reduced from chi2
    from scipy.stats import chi2
    rv = chi2(dof, scale=1 / float(dof))
    return rv.interval(0.95)
def __init__(self, x, y, sig=[], chi2limit=0.95, customlimits=[],
             outlier_threshold=[-3.0, 3.0], maxiter=50):
    nanmask = np.logical_or(np.isnan(x), np.isnan(y))
    mask = np.ones(len(x), dtype=bool)
    iter = 1
    rejn = 0

    # reject outliers
    while True:
        xrlm = x[mask & ~nanmask]
        yrlm = y[mask & ~nanmask]
        X = sm.add_constant(xrlm)
        rlm = sm.RLM(yrlm, X, missing='none',
                     M=sm.robust.norms.TukeyBiweight()).fit()
        residuals = y - (rlm.params[0] + rlm.params[1] * x)
        mad = np.median(np.absolute(residuals))
        sigmad = mad * 1.4286
        ratio = residuals / sigmad
        maskit = (ratio > outlier_threshold[0]) & (ratio < outlier_threshold[1])
        if np.array_equal(mask, maskit) or iter >= maxiter:
            self.outliers_mask = mask
            self.rlm_params = rlm.params
            self.niter = iter
            break
        else:
            mask = np.copy(maskit)
            iter += 1

    # weighted linear fit to cleaned data
    xlfit = x[self.outliers_mask]
    ylfit = y[self.outliers_mask]
    siglfit = sig[self.outliers_mask] if len(sig) else []
    weights = 1 / siglfit**2 if len(siglfit) > 0 else None
    polyfit = np.polyfit(xlfit, ylfit, deg=1, w=weights)
    polyfit_resid = ylfit - (polyfit[1] + polyfit[0] * xlfit)
    polyfit_dof = len(xlfit) - 2
    if len(sig) > 0:
        polyfit_zval = polyfit_resid / siglfit
        polyfit_chi2 = np.sum(polyfit_zval**2)
        self.polyfit_redchi2 = polyfit_chi2 / polyfit_dof
    self.polyfit_rms = math.sqrt(np.sum(polyfit_resid**2) / polyfit_dof)
    self.polyfit_chi2dist = chi2(polyfit_dof)
    self.polyfit = polyfit
    if len(customlimits) == 2:
        self.polyfit_chi2limits = customlimits
    else:
        self.polyfit_chi2limits = np.divide(
            self.polyfit_chi2dist.interval(chi2limit), float(polyfit_dof))
def chi2pval(data):
    """Chi-square goodness-of-fit p-value for a normal model of the data."""
    av = np.average(data)
    va = np.var(data)
    n = len(data)
    hist, binEdges = np.histogram(data, bins=50)
    rvn = stats.norm(loc=av, scale=np.sqrt(va))
    # expected counts per bin under the fitted normal (the original compared
    # normalized densities to bin probabilities, which mixes units)
    eHist = n * np.array([rvn.cdf(binEdges[i + 1]) - rvn.cdf(binEdges[i])
                          for i in range(len(binEdges) - 1)])
    chi2 = np.sum(np.power(hist - eHist, 2) / eHist)
    df = len(hist) - 1
    rv = stats.chi2(df)
    print(chi2, df, 1 - rv.cdf(chi2))
def normal_plevels(n):
    """
    Return an array of values of the probability within a +- k*sigma region
    centered on the mean of a normal distribution, for k = 1 to n.
    """
    c1cdf = stats.chi2(1).cdf
    levels = []
    for i in range(1, n + 1):
        levels.append(c1cdf(i**2))
    return np.array(levels)
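# Sanity check: the first three values returned by normal_plevels should
# reproduce the 68-95-99.7 rule, since P(|Z| <= k) = chi2(1).cdf(k**2):
from scipy.stats import chi2

for k in (1, 2, 3):
    print(k, chi2(1).cdf(k ** 2))  # ~0.6827, ~0.9545, ~0.9973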
def gauss_ell(mu, va, dim=[0, 1], npoints=100, level=0.39):
    """Given a mean and covariance for a multivariate gaussian, returns
    npoints points for the ellipse of confidence given by level (all points
    will be inside the ellipsoid with a probability equal to level).

    Returns the coordinates x and y of the ellipse."""
    c = np.array(dim)

    if mu.size < 2:
        raise RuntimeError("this function only makes sense for dimension 2 and more")
    if mu.size == va.size:
        mode = 'diag'
    else:
        if va.ndim == 2:
            if va.shape[0] == va.shape[1]:
                mode = 'full'
            else:
                raise DenError("variance not square")
        else:
            raise DenError("mean and variance are not dim conformant")

    # If X ~ N(mu, va), then (X - mu)' va^(-1) (X - mu) ~ Chi2(dim)
    chi22d = stats.chi2(2)
    mahal = np.sqrt(chi22d.ppf(level))

    # Generate a circle of npoints
    theta = np.linspace(0, 2 * np.pi, npoints)
    circle = mahal * np.array([np.cos(theta), np.sin(theta)])

    # Get the dimensions which we are interested in
    mu = mu[dim]
    if mode == 'diag':
        va = va[dim]
        elps = np.outer(mu, np.ones(npoints))
        elps += np.dot(np.diag(np.sqrt(va)), circle)
    elif mode == 'full':
        va = va[c, :][:, c]
        # Method: compute the cholesky decomposition of the cov matrix, that
        # is compute cova such that va = cova * cova'
        # WARN: scipy is different from matlab here, as scipy computes a
        # lower triangular cholesky decomposition:
        #   - va = cova * cova' (scipy)
        #   - va = cova' * cova (matlab)
        # So take care when comparing results with matlab!
        cova = np.linalg.cholesky(va)
        elps = np.outer(mu, np.ones(npoints))
        elps += np.dot(cova, circle)
    else:
        raise DenParam("var mode not recognized")

    return elps[0, :], elps[1, :]
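# A minimal usage sketch for gauss_ell, assuming numpy/matplotlib and a 2-D
# mean with a full covariance matrix (the values are made up):
import numpy as np
import matplotlib.pyplot as plt

mu = np.array([1.0, -0.5])
va = np.array([[2.0, 0.6],
               [0.6, 1.0]])
ex, ey = gauss_ell(mu, va, dim=[0, 1], npoints=200, level=0.95)
plt.plot(ex, ey)
plt.axis('equal')  # equal aspect so the ellipse is not distorted
plt.show()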
def reweight_covariance(self, data):
    """Reweight raw Minimum Covariance Determinant estimates.

    Reweight observations using Rousseeuw's method (equivalent to
    deleting outlying observations from the data set before computing
    location and covariance estimates). [1]

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        The data matrix, with p features and n samples.
        The data set must be the one which was used to compute
        the raw estimates.

    Returns
    -------
    location_reweighted : array-like, shape (n_features,)
        Reweighted robust location estimate.
    covariance_reweighted : array-like, shape (n_features, n_features)
        Reweighted robust covariance estimate.
    support_reweighted : array-like, type boolean, shape (n_samples,)
        A mask of the observations that have been used to compute
        the reweighted robust location and covariance estimates.

    Notes
    -----
    References:
    [1] A Fast Algorithm for the Minimum Covariance Determinant Estimator,
        1999, American Statistical Association and the American Society
        for Quality, TECHNOMETRICS
    """
    n_samples, n_features = data.shape
    X_centered = data - self.location_
    if self.store_precision:
        precision = self.precision_
    else:
        precision = linalg.pinv(self.covariance_)
    dist = np.sum(np.dot(X_centered, precision) * X_centered, 1)
    mask = dist < chi2(n_features).isf(0.025)
    if self.assume_centered:
        location_reweighted = np.zeros(n_features)
    else:
        location_reweighted = data[mask].mean(0)
    covariance_reweighted = self._nonrobust_covariance(
        data[mask], assume_centered=self.assume_centered)
    support_reweighted = np.zeros(n_samples).astype(bool)
    support_reweighted[mask] = True
    self._set_estimates(covariance_reweighted)
    self.location_ = location_reweighted
    self.support_ = support_reweighted
    return location_reweighted, covariance_reweighted, support_reweighted
def Chi2(df, tag=None):
    """
    A Chi-Squared random variate

    Parameters
    ----------
    df : int
        The degrees of freedom of the distribution (must be greater than one)
    """
    assert isinstance(df, int) and df > 1, 'DF must be an int greater than 1'
    return uv(rv=ss.chi2(df), tag=tag)
def buildKernelAdapt(self, X, C, y, regions, reml=True, maxiter=100):
    # prepare initial values for sig2e and for fixed effects
    hyp0_sig2e, hyp0_fixedEffects = self.getInitialHyps(X, C, y)
    bestKernelNames = []
    kernelsListAll = []
    hyp_kernels = []
    funcToSolve = self.infExact_scipy
    yVar = y.var()

    for r_i, r in enumerate(regions):
        # if r_i == 0:
        #     kernelsToTry = ['lin']
        # else:
        #     kernelsToTry = ['lin', 'poly2_lin', 'rbf_lin', 'nn_lin']
        kernelsToTry = ['lin', 'poly2_lin', 'rbf_lin', 'nn_lin']
        if self.verbose:
            print()
            print('selecting a kernel for region', r_i, 'with', r.sum(), 'SNPs')

        # add linear kernel
        X_lastRegion = X[:, r]
        linKernel = kernels.linearKernel(X_lastRegion)
        kernelsListAll.append(kernels.ScaledKernel(linKernel))
        kernelsListAll.append(None)

        bestFun = np.inf
        bestKernelName = None
        best_hyp0 = None
        bestKernel = None
        bestPval = np.inf

        # iterate over every possible kernel
        for kernelToTry in kernelsToTry:
            hyp0 = [0.5 * np.log(0.5 * yVar)]
            if self.verbose:
                print('Testing kernel:', kernelToTry)

            # create the kernel
            if kernelToTry == 'lin':
                kernel = None
                df = None
            elif kernelToTry == 'rbf_lin':
                kernel = kernels.RBFKernel(X_lastRegion)
                hyp0.append(np.log(1.0))  # ell
                df = 2
            elif kernelToTry == 'nn_lin':
                kernel = kernels.NNKernel(X_lastRegion)
                hyp0.append(np.log(1.0))  # ell
                df = 2
            elif kernelToTry == 'poly2_lin':
                kernel = kernels.Poly2KernelHomo(linKernel)
                df = 1
            else:
                raise Exception('unrecognized kernel name')

            if kernel is not None:
                # scale the kernel
                kernel = kernels.ScaledKernel(kernel)
                hyp0.append(0.5 * np.log(0.5 * yVar))  # scaling hyp
                # add the kernel as the final kernel in the kernels list
                kernelsListAll[-1] = kernel
                sumKernel = kernels.SumKernel(kernelsListAll)
            else:
                sumKernel = kernels.SumKernel(kernelsListAll[:-1])

            # test log likelihood obtained with this kernel for this region
            args = (sumKernel, C, y, reml)
            self.optimization_counter = 0
            hyp0_all = np.concatenate(
                (hyp0_sig2e, hyp0_fixedEffects, hyp_kernels + hyp0))
            optObj = gpUtils.minimize(hyp0_all, funcToSolve, -maxiter, *args)
            if not optObj.success:
                print('Optimization status:', optObj.status)
                print('optimization message:', optObj.message)
                raise Exception('optimization failed')
            print('final LL: %0.5e' % (-optObj.fun))

            if kernelToTry == 'lin':
                linLL = -optObj.fun
                pVal = 1.0
            else:
                # likelihood ratio test against the linear kernel
                llDiff = -optObj.fun - linLL
                if llDiff < 0:
                    pVal = 1.0
                else:
                    pVal = 0.5 * stats.chi2(df).sf(llDiff)
                print('llDiff: %0.5e' % llDiff, 'pVal:%0.5e' % pVal)

            if (kernelToTry == 'lin'
                    or (pVal < bestPval
                        and (len(kernelsToTry) == 1
                             or pVal < 0.05 / (len(kernelsToTry) - 1)))):
                bestOptObj = optObj
                bestPval = pVal
                bestKernelName = kernelToTry
                best_hyp0 = hyp0
                best_sumKernel = sumKernel
                bestKernel = kernel

        if bestKernel is not None:
            kernelsListAll[-1] = bestKernel
        else:
            kernelsListAll = kernelsListAll[:-1]
        hyp_kernels += best_hyp0
        bestKernelNames.append(bestKernelName)
        if self.verbose:
            print('selected kernel:', bestKernelName)

    if self.verbose:
        print('selected kernels:', bestKernelNames)
        print()
    return bestKernelNames
def test_Uniform_to_ChiSquare(self):
    X = RV(Uniform(a=0, b=1))
    sims = (-2 * log(X)).sim(Nsim)
    cdf = stats.chi2(df=2).cdf
    pval = stats.kstest(sims, cdf).pvalue
    self.assertTrue(pval > .01)
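# The identity under test: if U ~ Uniform(0, 1), then -2*log(U) is
# Exponential with mean 2, i.e. chi-squared with 2 degrees of freedom.
# The same check written with scipy alone (no RV/Uniform simulation classes):
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
sims = -2 * np.log(rng.uniform(size=10000))
print(stats.kstest(sims, stats.chi2(df=2).cdf).pvalue)  # should exceed 0.01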
# In[89]:

# Create crosstab of variables of interest
tab = pd.crosstab(data['V1'], data['V7'])

# In[90]:

tab

# In[91]:

from scipy.stats import chi2_contingency as chi2

# In[92]:

chi2(tab)

# #### Persons who belong to a farmers association are more likely to have attended a training before

# ### Are those visited by extension officers more likely to have attended trainings in the past

# In[93]:

tab2 = pd.crosstab(data['V3'], data['V7'])
chi2(tab2)

# #### Persons who have been visited by an extension officer were more likely to have attended a training before

# ***

# # Distribution of correct responses
def test(): import DDFacet.ToolsDir.Gaussian _,_,PSF=DDFacet.ToolsDir.Gaussian.Gaussian(10,311,1.) #PSF.fill(1.) #import scipy.signal #PP=scipy.signal.fftconvolve(PSF,PSF, mode='same') #print Fact import pylab pylab.clf() pylab.imshow(PSF,interpolation="nearest") pylab.colorbar() pylab.draw() pylab.show(False) pylab.pause(0.1) Dirty=np.zeros_like(PSF) nx,_=Dirty.shape Dirty[nx//2,nx//2+10]+=2. Dirty[nx//2+10,nx//2+10]+=2. Dirty=np.random.randn(*(Dirty.shape)) PSF=PSF.reshape((1,1,nx,nx))*np.ones((2,1,1,1)) Dirty=Dirty.reshape((1,1,nx,nx))*np.ones((2,1,1,1)) Dirty[1,:,:,:]=Dirty[0,:,:,:]*2 x,y=np.mgrid[0:nx,0:nx] dx=10 nc=nx//2 x=x[nc-dx:nc+dx,nc-dx:nc+dx].flatten() y=y[nc-dx:nc+dx,nc-dx:nc+dx].flatten() ListPixParms=[(x[i],y[i]) for i in range(x.size)] x,y=np.mgrid[0:nx,0:nx] dx=10 x=x[nc-dx:nc+dx,nc-dx:nc+dx].flatten() y=y[nc-dx:nc+dx,nc-dx:nc+dx].flatten() ListPixData=[(x[i],y[i]) for i in range(x.size)] CC=ClassConvMachine(PSF,ListPixParms,ListPixData,"Matrix") NFreqBands,_,_,_=Dirty.shape NPixListParms=len(ListPixParms) NPixListData=len(ListPixData) Array=np.zeros((NFreqBands,1,NPixListParms),np.float32) x0,y0=np.array(ListPixParms).T for iBand in range(NFreqBands): Array[iBand,0,:]=Dirty[iBand,0,x0,y0] Array=Array.reshape((NFreqBands,NPixListParms)) import pylab Lchi0=[] Lchi1=[] NTries=5000 ArrKeep0=np.zeros((NTries,NPixListParms),Array.dtype) ArrKeep1=np.zeros((NTries,NPixListParms),Array.dtype) for i in range(NTries): Array=np.random.randn(*Array.shape) #T=ClassTimeIt.ClassTimeIt() chi0=np.sum(Array**2) Lchi0.append(chi0) ConvArray0=CC.Convolve(Array) chi1=np.sum(ConvArray0**2) #T.timeit("0") #ConvArray1=CC.Convolve(Array,ConvMode="Vector").ravel() #T.timeit("1") #r=chi1/chi0 #print "%f -> %f [%r]"%(chi0,chi1,r) NChan,_,NN=ConvArray0.shape NN=int(np.sqrt(NN)) ArrKeep0[i]=Array[0].ravel() ArrKeep1[i]=ConvArray0[0].ravel() # pylab.clf() # pylab.imshow(ConvArray0.reshape((2,NN,NN))[0],interpolation="nearest") # pylab.draw() # pylab.show(False) # pylab.pause(0.1) Lchi1.append(chi1) #print np.var(Array),np.var(ConvArray0)/Fact Fact=CC.NormData[0] print(np.median(np.std(ArrKeep0,axis=0)**2)) print(np.median(np.std(ArrKeep1,axis=0)**2/Fact)) return from scipy.stats import chi2 from DDFacet.ToolsDir.GeneDist import ClassDistMachine DM=ClassDistMachine() rv = chi2(Array.size) x=np.linspace(0,2*rv.moment(1),1000) P=rv.cdf(x) pylab.clf() pylab.subplot(2,1,1) #yd,xe=pylab.histogram(Lchi0,bins=100,normed=True) #xd=(xe[1::]+xe[0:-1])/2. #yd/=np.sum(yd) xd,yd=DM.giveCumulDist(np.array(Lchi0),Ns=100) #dx=xd[1]-xd[0] #yd/=dx pylab.plot(xd,yd) pylab.plot(x,P) pylab.xlim(0,1600) pylab.subplot(2,1,2) xd,yd=DM.giveCumulDist(np.array(Lchi1),Ns=20) # yd,xe=pylab.histogram(Lchi1,bins=100,normed=True) # xd=(xe[1::]+xe[0:-1])/2. # dx=xd[1]-xd[0] # yd/=np.sum(yd) # yd/=dx print(np.mean(Lchi1)/Fact) print(np.mean(Lchi0)) # #pylab.xlim(0,800) # #pylab.hist(Lchi1,bins=100) import scipy.interpolate cdf=scipy.interpolate.interp1d(xd, yd,"cubic") x=np.linspace(xd.min(),xd.max(),1000) #pylab.plot(x,cdf(x),ls="",marker=".") #pylab.plot(xd,yd,ls="",marker="s") y=cdf(x) x,y=xd, yd y=y[1::]-y[0:-1] x=(x[1::]+x[0:-1])/2. pylab.plot(x,y,ls="",marker=".") #pylab.xlim(0,1600) pylab.draw() pylab.show(False) # import pylab # pylab.clf() # #pylab.plot(ConvArray0.ravel()) # pylab.imshow(PSF[0,0]) # #pylab.plot(ConvArray1) # #pylab.plot(ConvArray1-ConvArray0) # pylab.draw() # pylab.show(False) stop
def test_Exponential_to_ChiSquare(self):
    X = RV(Exponential(rate=1 / 2))
    sims = X.sim(Nsim)
    cdf = stats.chi2(df=2).cdf
    pval = stats.kstest(sims, cdf).pvalue
    self.assertTrue(pval > .01)
def find_optimal_threshold(m, P):
    """Returns the min. threshold to have prob. < P to be BG (averaging m ph).

    Same formula as find_optimal_T(), but must be multiplied by the BG rate
    to obtain the rate threshold.
    """
    return m / (0.5 * chi2(2 * m).ppf(P))
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Define the distribution parameters to be plotted
k_values = [1, 2, 5, 7]
linestyles = ['-', '--', ':', '-.']
mu = 0
x = np.linspace(-1, 20, 1000)

#------------------------------------------------------------
# plot the distributions
fig, ax = plt.subplots(figsize=(5, 3.75))
fig.subplots_adjust(bottom=0.12)

for k, ls in zip(k_values, linestyles):
    dist = chi2(k, mu)
    plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$k=%i$' % k)

plt.xlim(0, 10)
plt.ylim(0, 0.5)

plt.xlabel('$Q$')
plt.ylabel(r'$p(Q|k)$')
plt.title(r'$\chi^2\ \mathrm{Distribution}$')

plt.legend()
plt.show()
def __init__(self, theta):
    self._chi2 = chi2(theta)
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2

df = 5
rv = chi2(df)

# Generate a sample of size 1000 from it
sampleRange = chi2.rvs(df, size=1000)

# Plot a histogram of the sample and draw the theoretical density of the
# random variable on top of it.
# plt.hist(sampleRange, normed=True, bins=20, alpha=0.5, label='hist samples')
# plt.ylabel('number of samples')
# plt.xlabel('$x$')

# theoretical density of the random variable
left = chi2.ppf(0.01, df)
right = chi2.ppf(0.99, df)
x = np.linspace(left, 20, 100)
# plt.plot(x, rv.pdf(x), 'r-', lw=5, alpha=0.7, label='chi2 pdf')
plt.legend(loc='best')
# plt.show()

# values = np.array([pareto.rvs(k, size=10) for x in range(10)])
# print(values)
# plt.hist(values.mean(axis=1), normed=True)

m = []
# for _ in range(20):
#     m.append(np.mean(chi2.rvs(df, size=1000)))
def r(arr):
    arr = arr.dropna()
    rhos = acf(arr, nlags=12, fft=True)
    test = arr.shape[0] * rhos[-1]**2 / (1 + (rhos[1:-1]**2).sum())
    return test > stats.chi2(1).ppf(0.9)
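# r() looks like a Wald-type check that the lag-12 autocorrelation is nonzero,
# comparing n*rho_12**2 / (1 + sum of the lower-lag rho**2) against the
# chi2(1) 90% quantile (cf. Bartlett's approximation for the variance of
# sample autocorrelations). A hypothetical demo on a clearly seasonal series,
# assuming the module-level acf (statsmodels) and scipy.stats imports that
# r() relies on:
import numpy as np
import pandas as pd

t = np.arange(240)
seasonal = pd.Series(np.sin(2 * np.pi * t / 12)
                     + 0.1 * np.random.default_rng(0).normal(size=240))
print(r(seasonal))  # expected True for a strong period-12 signal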
#viz.plot_vel_err(ownship, navsys, boxplot=False)

# Tracking results
xy_measurements = [polar_to_cartesian(ground_radar.data[:, k])
                   for k in range(len(radar_time))]
xy_measurements = np.vstack(xy_measurements).T
_, ax_xy = plt.subplots(1, 2)
ax_xy[0].plot(ownship.state[1, :], ownship.state[0, :])
viz.target_xy(target, perfect_pose_imm, ax=ax_xy[0], measurements=xy_measurements)
ax_xy[0].set_title('IMM - perfect pose')
ax_xy[1].plot(ownship.state[1, :], ownship.state[0, :])
viz.target_xy(target, navigation_imm, ax=ax_xy[1], measurements=xy_measurements)
ax_xy[1].set_title('IMM - navigation pose')
viz.target_velocity(target, navigation_imm)
viz.target_velocity(target, perfect_pose_imm)

# NEES plot: chi-square bounds for the average NEES over N_MC runs
UB = chi2(df=2 * N_MC).ppf(0.975) / N_MC * np.ones_like(radar_time)
LB = chi2(df=2 * N_MC).ppf(0.025) / N_MC * np.ones_like(radar_time)
NEES_fig, consistency_ax = plt.subplots(1, 2)
NEES_ax = consistency_ax[0]
NEES_ax.plot(radar_time, UB, 'k')
NEES_ax.plot(radar_time, LB, 'k')
NEES_ax.plot(radar_time, np.mean(NEES_nav, axis=0), label='navigation pose')
NEES_ax.plot(radar_time, np.mean(NEES_perf, axis=0), label='perfect pose')
NEES_ax.legend()
NEES_ax.set_title('NEES of tracking for ' + str(N_MC) + ' monte carlo runs')
RMS_ax = consistency_ax[1]
RMS_ax.plot(radar_time, np.sqrt(np.mean(RMSE_nav, axis=0)), label='navigation pose')
RMS_ax.plot(radar_time, np.sqrt(np.mean(RMSE_perf, axis=0)), label='perfect pose')
RMS_ax.legend()
RMS_ax.set_title('Position RMS error for ' + str(N_MC) + ' monte carlo runs')
def runMetroSingleChain(self, individual0, NSteps=1000, chain_dict={}): df = self.PM.NPixListData self.rv = chi2(df) _, Chi2 = self.GiveFitness(individual0) self.MinChi2 = Chi2 logProb = self.rv.logpdf(Chi2) x = np.linspace(0, 2 * self.rv.moment(1), 1000) lP = self.rv.logpdf(x) iMax = np.argmax(lP) self.Chi2PMax = x[iMax] # ##################### # # V0 #self.Var=self.MinChi2/self.Chi2PMax #Chi20_n=self.MinChi2/self.Var #VarMin=(3e-3)**2 #ThVar=np.max([self.Var,VarMin]) #ShrinkFactor=np.min([1.,self.Var/ThVar]) # # print # # print ShrinkFactor # # print # # stop # ##################### VarMin = (3e-4)**2 #self.Var=np.max([self.EstimatedStdFromResid**2,VarMin]) Var = self.MinChi2 / self.Chi2PMax S = self.PM.ArrayToSubArray(individual0, Type="S") B = np.sum(np.abs(S)) / float(S.size) B0 = 7e-4 Sig0 = 3e-3 Sig = B * Sig0 / B0 # print # print "%f %f %f -> %f"%(B,B0,Sig0,Sig) # print self.Var = np.max([4. * self.EstimatedStdFromResid**2, Sig**2]) Chi20_n = self.MinChi2 / self.Var ShrinkFactor = 1. # ##################### DicoChains = {} Parms = individual0 # ################################## DoPlot = True if DoPlot: import pylab pylab.figure(1) x = np.linspace(0, 2 * self.rv.MeanChi2, 1000) P = self.rv.pdf(x) pylab.clf() pylab.plot(x, P) Chi2Red = Chi2_0 #/self.Var pylab.scatter(Chi2Red, np.mean(P), c="black") pylab.draw() pylab.show(False) # ################################## # ################################## DoPlot = False # DoPlot=True if DoPlot: import pylab x = np.linspace(0, 2 * self.rv.moment(1), 1000) P = self.rv.pdf(x) pylab.clf() pylab.plot(x, P) pylab.scatter(Chi20_n, np.mean(P), c="black") pylab.draw() pylab.show(False) # ################################## DicoChains["Parms"] = [] DicoChains["Chi2"] = [] DicoChains["logProb"] = [] logProb0 = self.rv.logpdf(Chi20_n) Mut_pFlux, Mut_p0, Mut_pMove = 0.2, 0., 0.3 #T.disable() FactorAccelerate = 1. 
lAccept = [] NBurn = self.GD["MetroClean"]["MetroNBurnin"] NSteps = NSteps + NBurn NAccepted = 0 iStep = 0 NMax = NSteps #10000 #for iStep in range(NSteps): while NAccepted < NSteps and iStep < NMax: iStep += 1 #print "========================" #print iStep individual1, = self.MutMachine.mutGaussian(individual0.copy(), Mut_pFlux, Mut_p0, Mut_pMove) #, #FactorAccelerate=FactorAccelerate) # ds=Noise # individual1,=self.MutMachine.mutNormal(individual0.copy(),ds*1e-1*FactorAccelerate) # #T.timeit("mutate") _, Chi2 = self.GiveFitness(individual1) # if Chi2<self.MinChi2: # self.Var=Chi2/self.Chi2PMax # #print " >>>>>>>>>>>>>> %f"%np.min(Chi2) Chi2_n = Chi2 / self.Var Chi2_n = Chi20_n + ShrinkFactor * (Chi2_n - Chi20_n) logProb = self.rv.logpdf(Chi2_n) p1 = logProb p0 = logProb0 #DicoChains["logProb"][-1] if p1 - p0 > 5: R = 1 elif p1 - p0 < -5: R = 0 else: R = np.min([1., np.exp(p1 - p0)]) r = np.random.rand(1)[0] #print "%5.3f [%f -> %f]"%(R,p0,p1) # print "MaxDiff ",np.max(np.abs(self.pop[iChain]-DicoChains[iChain]["Parms"][-1])) lAccept.append((r < R)) if r < R: # accept individual0 = individual1 logProb0 = logProb NAccepted += 1 if NAccepted > NBurn: DicoChains["logProb"].append(p1) DicoChains["Parms"].append(individual1) DicoChains["Chi2"].append(Chi2_n) if DoPlot: pylab.scatter(Chi2_n, np.exp(p1), lw=0) pylab.draw() pylab.show(False) pylab.pause(0.1) # print " accept" # # Model=self.StackChain() # # Asq=self.ArrayMethodsMachine.PM.ModelToSquareArray(Model,TypeInOut=("Parms","Parms")) # # _,npol,NPix,_=Asq.shape # # A=np.mean(Asq,axis=0).reshape((NPix,NPix)) # # Mask=(A==0) # # pylab.clf() # # pylab.imshow(A,interpolation="nearest") # # pylab.draw() # # pylab.show(False) # # pylab.pause(0.1) else: # # ####################### if DoPlot: pylab.scatter(Chi2_n, np.exp(p1), c="red", lw=0) pylab.draw() pylab.show(False) pylab.pause(0.1) # # ####################### pass #T.timeit("Compare") AccRate = np.count_nonzero(lAccept) / float(len(lAccept)) #print "[%i] Acceptance rate %f [%f with ShrinkFactor %f]"%(iStep,AccRate,FactorAccelerate,ShrinkFactor) if (iStep % 50 == 0) & (iStep > 10): if AccRate > 0.234: FactorAccelerate *= 1.5 else: FactorAccelerate /= 1.5 FactorAccelerate = np.min([3., FactorAccelerate]) FactorAccelerate = np.max([.01, FactorAccelerate]) lAccept = [] #T.timeit("Acceptance") T.timeit("Chain") chain_dict["logProb"] = np.array(DicoChains["logProb"]) chain_dict["Parms"] = np.array(DicoChains["Parms"]) chain_dict["Chi2"] = np.array(DicoChains["Chi2"])
    return (y - func(x, a, b, c, d, e))**2

# Least-squares fit
p0 = [1, 1, 1, 1, 1]  # starting values
plsq, cov = curve_fit(func, xData, yData, p0, sigma=yerr)
a, b, c, d, e = plsq[0], plsq[1], plsq[2], plsq[3], plsq[4]
np.set_printoptions(precision=2)
print(cov)
yFit = func(xData, a, b, c, d, e)
print("Param for UpFit ist:", 'a= ', a, ' b= ', b, ' c =', c, ' d= ', d, ' e= ', e)

# Chi-square test
S = np.sum(residuals(xData, yData, a, b, c, d, e) / (yerr**2))
dof = len(xData) - 5  # put number of parameters here
rv = chi2(dof)
chimin, chimax = rv.ppf(0.025), rv.ppf(0.975)

# two-sided p-value test
if S >= dof:
    pvalue = 2 * (1 - rv.cdf(S))
if S < dof:
    pvalue = 2 * rv.cdf(S)
#pvalue = rv.cdf(S)
print('chimin:' + str('%.2f' % chimin), 'chimax:' + str('%.2f' % chimax),
      'chisquare:' + str('%.2f' % S), 'pvalue: ' + str('%.2f' % pvalue))

# plot result
#plt.figure()
#plt.subplot(211)
#plt.title(r'A telling title')
plt.errorbar(xData, yData, yerr, fmt='o', label=r'Up', color='r')
plt.plot(xData, yFit, label='Up Fit', color='r')
_DIST_MAP = {
    dist.BernoulliProbs: lambda probs: osp.bernoulli(p=probs),
    dist.BernoulliLogits: lambda logits: osp.bernoulli(p=_to_probs_bernoulli(logits)),
    dist.Beta: lambda con1, con0: osp.beta(con1, con0),
    dist.BinomialProbs: lambda probs, total_count: osp.binom(n=total_count, p=probs),
    dist.BinomialLogits: lambda logits, total_count: osp.binom(n=total_count,
                                                               p=_to_probs_bernoulli(logits)),
    dist.Cauchy: lambda loc, scale: osp.cauchy(loc=loc, scale=scale),
    dist.Chi2: lambda df: osp.chi2(df),
    dist.Dirichlet: lambda conc: osp.dirichlet(conc),
    dist.Exponential: lambda rate: osp.expon(scale=np.reciprocal(rate)),
    dist.Gamma: lambda conc, rate: osp.gamma(conc, scale=1. / rate),
    dist.HalfCauchy: lambda scale: osp.halfcauchy(scale=scale),
    dist.HalfNormal: lambda scale: osp.halfnorm(scale=scale),
    dist.InverseGamma: lambda conc, rate: osp.invgamma(conc, scale=rate),
    dist.LogNormal: lambda loc, scale: osp.lognorm(s=scale, scale=np.exp(loc)),
    dist.MultinomialProbs:
def test_linear_model_parameters_risk_free_gls(data): mod = LinearFactorModel(data.portfolios, data.factors, risk_free=True) p = mod.portfolios.ndarray sigma = np.cov(p.T) val, vec = np.linalg.eigh(sigma) sigma_m12 = vec @ np.diag(1.0 / np.sqrt(val)) @ vec.T sigma_inv = np.linalg.inv(sigma) mod = LinearFactorModel(data.portfolios, data.factors, risk_free=True, sigma=sigma) assert 'using GLS' in str(mod) res = mod.fit() f = mod.factors.ndarray p = mod.portfolios.ndarray n = f.shape[0] moments = np.zeros( (n, p.shape[1] * (f.shape[1] + 1) + f.shape[1] + 1 + p.shape[1])) fc = np.c_[np.ones((n, 1)), f] betas = np.linalg.lstsq(fc, p)[0] eps = p - fc @ betas loc = 0 for i in range(eps.shape[1]): for j in range(fc.shape[1]): moments[:, loc] = eps[:, i] * fc[:, j] loc += 1 bc = np.c_[np.ones((p.shape[1], 1)), betas[1:, :].T] lam = np.linalg.lstsq(sigma_m12 @ bc, sigma_m12 @ p.mean(0)[:, None])[0] pricing_errors = p - (bc @ lam).T for i in range(lam.shape[0]): lam_error = pricing_errors @ sigma_inv @ bc[:, [i]] moments[:, loc] = lam_error.squeeze() loc += 1 alphas = p.mean(0)[:, None] - bc @ lam moments[:, loc:] = pricing_errors - alphas.T mod_moments = mod._moments(eps, bc, lam, alphas, pricing_errors) assert_allclose(res.betas, bc[:, 1:]) assert_allclose(res.risk_premia, lam.squeeze()) assert_allclose(res.alphas, alphas.squeeze()) assert_allclose(moments, mod_moments) m = moments.shape[1] jac = np.eye(m) block1 = p.shape[1] * (f.shape[1] + 1) # 1,1 jac[:block1, :block1] = np.kron(np.eye(p.shape[1]), fc.T @ fc / n) # 2, 1 loc = 0 nport, nf = p.shape[1], f.shape[1] block2 = block1 + nf + 1 bct = sigma_inv @ bc at = sigma_inv @ alphas for i in range(nport): block = np.zeros((nf + 1, nf + 1)) for j in range(nf + 1): # rows for k in range(1, nf + 1): # cols block[j, k] = bct[i][j] * lam[k] if j == k: block[j, k] -= at[i] jac[block1:block2, loc:loc + nf + 1] = block loc += nf + 1 # 2, 2 jac[block1:block2, block1:block2] = bc.T @ sigma_inv @ bc # 3,1 block = np.zeros((nport, nport * (nf + 1))) row = col = 0 for i in range(nport): for j in range(nf + 1): if j != 0: block[row, col] = lam[j] col += 1 row += 1 jac[-nport:, :(nport * (nf + 1))] = block # 3, 2 jac[-nport:, (nport * (nf + 1)):(nport * (nf + 1)) + nf + 1] = bc # 3, 3: already done since eye mod_jac = mod._jacobian(bc, lam, alphas) assert_allclose(mod_jac[:block1], jac[:block1]) assert_allclose(mod_jac[block1:block2, :block1], jac[block1:block2, :block1]) assert_allclose(mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2]) assert_allclose(mod_jac[block1:block2, block2:], jac[block1:block2, block2:]) assert_allclose(mod_jac[block2:], jac[block2:]) s = moments.T @ moments / (n - (nf + 1)) ginv = np.linalg.inv(jac) cov = ginv @ s @ ginv.T / n order = np.zeros((nport, nf + 1), dtype=np.int64) order[:, 0] = np.arange(block2, block2 + nport) for i in range(nf): order[:, i + 1] = (nf + 1) * np.arange(nport) + (i + 1) order = np.r_[order.ravel(), block1:block2] cov = cov[order][:, order] cov = (cov + cov.T) / 2 assert_allclose(cov, res.cov) acov = cov[:block1:(nf + 1), :block1:(nf + 1)] jstat = float(alphas.T @ np.linalg.pinv(acov) @ alphas) assert_allclose(res.cov.values[:block1:(nf + 1), :block1:(nf + 1)], acov) assert_allclose(res.j_statistic.stat, jstat, rtol=1e-1) assert_allclose(res.j_statistic.pval, 1 - stats.chi2(nport - nf - 1).cdf(jstat), rtol=1e-2) get_all(res)
from math import log, exp

from numpy import partition
from numpy import mean
from numpy_sugar.special import logsumexp
from scipy.stats import chi2


def _get_median_terms(n):
    if n % 2 == 0:
        nh = n // 2
        kth = [nh - 1, nh]
    else:
        kth = [(n - 1) // 2]
    return kth


_chi2_df1 = chi2(df=1)


def gcontrol(chi2_values):
    """
    Genomic control
    """
    n = len(chi2_values)
    kth = _get_median_terms(n)
    chi2_values = partition(chi2_values, kth)
    x2obs = mean(chi2_values[kth])
    x2exp = _chi2_df1.ppf(0.5)
    return x2obs / x2exp


def qvalues(pv):
    import rpy2.robjects as robjects
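# A quick usage sketch for gcontrol: under the null, 1-d.o.f. association
# statistics have median chi2(1).ppf(0.5) ~ 0.4549, so the genomic-control
# lambda should be close to 1 for null data (synthetic sample below).
from scipy.stats import chi2

null_stats = chi2(df=1).rvs(size=100000, random_state=0)
print(gcontrol(null_stats))  # ~1.0 under the null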
def test_linear_model_parameters(data): mod = LinearFactorModel(data.portfolios, data.factors) res = mod.fit() f = mod.factors.ndarray p = mod.portfolios.ndarray n = f.shape[0] moments = np.zeros( (n, p.shape[1] * (f.shape[1] + 1) + f.shape[1] + p.shape[1])) fc = np.c_[np.ones((n, 1)), f] betas = np.linalg.lstsq(fc, p)[0] eps = p - fc @ betas loc = 0 for i in range(eps.shape[1]): for j in range(fc.shape[1]): moments[:, loc] = eps[:, i] * fc[:, j] loc += 1 b = betas[1:, :].T lam = np.linalg.lstsq(b, p.mean(0)[:, None])[0] pricing_errors = p - (b @ lam).T for i in range(lam.shape[0]): lam_error = (p - (b @ lam).T) @ b[:, [i]] moments[:, loc] = lam_error.squeeze() loc += 1 alphas = pricing_errors.mean(0)[:, None] moments[:, loc:] = pricing_errors - alphas.T mod_moments = mod._moments(eps, b, lam, alphas, pricing_errors) assert_allclose(res.betas, b) assert_allclose(res.risk_premia, lam.squeeze()) assert_allclose(res.alphas, alphas.squeeze()) assert_allclose(moments, mod_moments) m = moments.shape[1] jac = np.eye(m) block1 = p.shape[1] * (f.shape[1] + 1) # 1,1 jac[:block1, :block1] = np.kron(np.eye(p.shape[1]), fc.T @ fc / n) # 2, 1 loc = 0 nport, nf = p.shape[1], f.shape[1] block2 = block1 + nf for i in range(nport): block = np.zeros((nf, nf + 1)) for j in range(nf): # rows for k in range(1, nf + 1): # cols block[j, k] = b[i][j] * lam[k - 1] if j + 1 == k: block[j, k] -= alphas[i] jac[block1:block2, loc:loc + nf + 1] = block loc += nf + 1 # 2, 2 jac[block1:block2, block1:block2] = b.T @ b # 3,1 block = np.zeros((nport, nport * (nf + 1))) row = col = 0 for i in range(nport): for j in range(nf + 1): if j != 0: block[row, col] = lam[j - 1] col += 1 row += 1 jac[-nport:, :(nport * (nf + 1))] = block # 3, 2 jac[-nport:, (nport * (nf + 1)):(nport * (nf + 1)) + nf] = b # 3, 3: already done since eye mod_jac = mod._jacobian(b, lam, alphas) assert_allclose(mod_jac[:block1], jac[:block1]) assert_allclose(mod_jac[block1:block2, :block1], jac[block1:block2, :block1]) assert_allclose(mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2]) assert_allclose(mod_jac[block1:block2, block2:], jac[block1:block2, block2:]) assert_allclose(mod_jac[block2:], jac[block2:]) s = moments.T @ moments / (n - (nf + 1)) ginv = np.linalg.inv(jac) cov = ginv @ s @ ginv.T / n order = np.zeros((nport, nf + 1), dtype=np.int64) order[:, 0] = np.arange(block2, block2 + nport) for i in range(nf): order[:, i + 1] = (nf + 1) * np.arange(nport) + (i + 1) order = np.r_[order.ravel(), block1:block2] cov = cov[order][:, order] cov = (cov + cov.T) / 2 assert_allclose(cov, res.cov) acov = cov[:block1:(nf + 1), :block1:(nf + 1)] jstat = float(alphas.T @ np.linalg.pinv(acov) @ alphas) assert_allclose(res.j_statistic.stat, jstat) assert_allclose(res.j_statistic.pval, 1 - stats.chi2(nport - nf).cdf(jstat)) get_all(res) res = LinearFactorModel(data.portfolios, data.factors).fit(cov_type='kernel', debiased=False) std_mom = moments / moments.std(0)[None, :] mom = std_mom.sum(1) bw = kernel_optimal_bandwidth(mom) w = kernel_weight_bartlett(bw, n - 1) s = _cov_kernel(moments, w) cov = ginv @ s @ ginv.T / n cov = cov[order][:, order] cov = (cov + cov.T) / 2 assert_allclose(cov, res.cov)
def epf_ineff_manipulate_alt(truepar, dictd, dictv, n=75000, rlow=0, rhigh=500, offset=0, per=1, getlow=-1, gethigh=-1, direct="", jchoice=-1): filename = dictd['filename'] dataname = dictd['dataname'] dataname2 = dictd['dataname2'] wdataname = dictd['wdataname'] vdataname = dictd['vdataname'] if 'filename2' in dictd: filename2 = dictd['filename2'] if 'fixed' in dictv: filename = filename + '_fix' Summest = [] for i in range(int(rlow / per + offset / per), int(ceil(rhigh / per + offset / per))): try: Summest = Summest + [ pd.read_csv("../Results/MC/Trials/" + direct + "summary_" + filename + "_" + str(i), header=None) ] except: continue if 'filename2' in dictd: Summest2 = [] for i in range(int(rlow / per + offset / per), int(ceil(rhigh / per + offset / per))): try: Summest2 = Summest2 + [ pd.read_csv("../Results/MC/Trials/" + direct + "summary_" + filename2 + "_" + str(i), header=None) ] except: continue else: Summest2 = Summest jacdes = "" if jchoice >= 0: jacdes = '_jac_' + str(jchoice) Varoos = [] for i in range(int(rlow / per + offset / per), int(ceil(rhigh / per + offset / per))): try: Varoos = Varoos + [ pd.read_csv("../Results/MC/Trials/" + direct + "varoos_" + filename + "_" + str(i), header=None) ] except: continue Summest = np.array(pd.concat(Summest)) Summest2 = np.array(pd.concat(Summest2)) Varoos = np.array(pd.concat(Varoos)) Qdat = np.array(pd.read_csv("../Results/MC/" + dataname, header=None)) Qdat2 = np.array(pd.read_csv("../Results/MC/" + dataname2, header=None)) if wdataname == 'identity': Wdat = np.identity(8) else: Wdat = np.array(pd.read_csv("../Results/MC/" + wdataname, header=None)) Vdat = np.array(pd.read_csv("../Results/MC/" + vdataname, header=None)) #older = Summest[:,1] > 0 #Summest = Summest[older, :] #Varoos = Varoos[older, :] #jacw = np.prod(np.isnan(Summest[:,njac1:njac1_end].astype(float))==False,1) if 'njac' in dictv: njac = dictv['njac'] else: njac = njac1 if 'njaca' in dictv: njaca = dictv['njaca'] else: njaca = njac2 if 'njacend' in dictv: njacend = dictv['njacend'] else: njacend = njac1_end if 'njacaend' in dictv: njacaend = dictv['njacaend'] else: njacaend = njac2_end if 'nimom' in dictv: nimom = dictv['nimom'] else: nimom = nmom if 'nimom_end' in dictv: nimom_end = dictv['nimom_end'] else: nimom_end = nmom_end if 'namom' in dictv: namom = dictv['namom'] else: namom = nepfq if 'namom_end' in dictv: namom_end = dictv['namom_end'] else: namom_end = nepfq_end if 'npar_est' in dictv: npar_est = dictv['npar_est'] else: npar_est = 4 if 'nresi' in dictv: nresi = dictv['nresi'] else: nresi = nres if 'nresi_end' in dictv: nresi_end = dictv['nresi_end'] else: nresi_end = nres_end if 'jacname' in dictd: jacname = dictd['jacname'] else: jacname = 'jacobians_' + filename n_mom = nimom_end - nimom print(n_mom) n_mom2 = namom_end - namom if jchoice == -1: jacs = Summest[:, njac:njacend] jacs2 = Summest[:, njaca:njacaend] #jacs = Summest[:, njac:njacend] #jacs2 = Summest[:, njaca:njacaend] jacs[jacs == ' '] = ' -nan' jacs2[jacs2 == ' '] = ' -nan' jacs = jacs.astype(float) jacs2 = jacs2.astype(float) elif jchoice == -2: Jaccest = [] for i in range(int(rlow / per + offset / per), int(ceil(rhigh / per + offset / per))): try: Jaccest = Jaccest + [ pd.read_csv("../Results/MC/Trials/" + direct + "summary_" + jacname + "_" + str(i), header=None) ] except: continue Jaccest = np.array(pd.concat(Jaccest)) if np.shape(Jaccest)[0] > np.shape(Summest)[0]: Jaccest = Jaccest[0:np.shape(Summest)[0]] jacs = Jaccest[:, njac:njacend] jacs2 = Jaccest[:, njaca:njacaend] #jacs = 
Summest[:, njac:njacend] #jacs2 = Summest[:, njaca:njacaend] jacs[jacs == ' '] = ' -nan' jacs2[jacs2 == ' '] = ' -nan' jacs = jacs.astype(float) jacs2 = jacs2.astype(float) else: Jaccest = [ pd.read_csv("../Results/MC/Trials/" + direct + jacname + "_" + str(i), header=None) for i in range(int(rlow / per + offset / per), int(ceil(rhigh / per + offset / per))) ] Jaccest = np.array(pd.concat(Jaccest)) Jaccest = Jaccest[ind, :] Jaccest = Jaccest[selected, :] jacs = Jaccest[:, jchoice * (njac2_end - njac1) + njac - njac1:jchoice * (njac2_end - njac1) + njacend - njac1].astype(float) jacs2 = Jaccest[:, jchoice * (njac2_end - njac1) + njaca - njac1:jchoice * (njac2_end - njac1) + njacaend - njac1].astype(float) jacw = np.prod(np.isnan(jacs) == False, 1) == 1 Summest = Summest[0:np.shape(jacs)[0], :] Varoos = Varoos[0:np.shape(jacs)[0], :] Summest = Summest[jacw, :] Varoos = Varoos[jacw, :] jacs = jacs[jacw, :] jacs2 = jacs2[jacw, :] trash, ind = np.unique(Summest[:, 0], return_index=True) Summest = Summest[ind, :] Varoos = Varoos[ind, :] jacs = jacs[ind, :] jacs2 = jacs2[ind, :] if getlow < 0: getlow = np.nanmin(Summest[:, 0].astype(float)) if gethigh < 0: gethigh = np.nanmax(Summest[:, 0].astype(float)) + 1 selected = np.array([(si >= getlow) and (si < gethigh) for si in Summest[:, 0].astype(float)]) Summest = Summest[selected, :] Varoos = Varoos[selected, :] jacs = jacs[selected, :] jacs2 = jacs2[selected, :] Mest = np.array(Summest[:, nres:nres_end]).astype(float) n_par = Mest.shape[1] print(n_par) err = Mest - truepar #print(err) bias = np.nanmean(err, 0) mse = np.nanmean(err**2, 0).astype(float) #print(mse) #rmse = np.sqrt(np.array(mse)) rmse = 0 #for i in range(0, 4): # bias[i] = np.nanmean(err[:,i][np.abs(err[:,i])>0]) # mse[i] = np.nanmean(err[:,i][np.abs(err[:,i])>0]**2) #print(mse) rmse = np.sqrt(np.array(mse)) sd = np.zeros((Mest.shape[0], 4)) ub = np.zeros((Mest.shape[0], 4)) lb = np.zeros((Mest.shape[0], 4)) jacworked = np.prod(np.isnan(jacs) == False, 1) njacworked = np.sum(jacworked) inb = np.zeros((njacworked, Mest.shape[1])) ts = np.zeros((njacworked, 4)) jn = 0 jstat = np.zeros(njacworked) jstato = np.zeros(njacworked) tsn = np.zeros((njacworked, n_mom)) Diff = np.zeros((Summest.shape[0], n_mom)) Diff2 = np.zeros((njacworked, n_mom)) Diffo = np.zeros((njacworked, n_mom2)) J = np.zeros((njacworked, n_mom * 4)) J2 = np.zeros((njacworked, n_mom2 * 4)) JWJ = np.zeros((njacworked, 4 * 4)) CV = np.zeros((njacworked, n_mom * 4)) V = np.zeros((njacworked, n_mom * n_mom)) VV = np.zeros((njacworked, n_mom * n_mom)) VVV = np.zeros((njacworked, n_mom * n_mom)) VO = np.zeros((njacworked, n_mom2 * n_mom2)) for i in range(0, Summest.shape[0]): jac = jacs[i, :].reshape(n_mom, n_par).transpose().astype(float) jac = jac[0:npar_est, :] jac[np.abs(jac) < 1e-8] = 0 #jac[np.abs(jac) > 50] = 0 jac2 = jacs2[i, :].reshape(n_mom2, n_par).transpose().astype(float) jac2 = jac2[0:npar_est, :] jac2[np.abs(jac2) < 1e-8] = 0 #jac2[np.abs(jac2) > 50] = 0 #if moment: # jac2 = JE # jac = JM #else: # jac = JE # jac2 = JM try: diff = Summest[i, nimom:nimom_end] - Qdat[Summest[i, 0], :] except: diff = Summest[i, nimom:nimom_end] * np.nan #if n_mom == 8: # if np.sum(jacs2[i,:]==0)>0: # jac2 = jac2 * np.nan #else: # if np.sum(jacs[i,:]==0)>0: # jac = jac * np.nan #for i in range(0, n_par): # for j in range(0, n_mom): # jac[i, j] = float(jac[i,j]) Diff[i, :] = diff try: if wdataname == 'identity': w = np.identity(n_mom) else: w = Wdat[Summest[i, 0], :].reshape(n_mom, n_mom) v = Vdat[Summest[i, 0], :].reshape(n_mom, 
n_mom) diff2 = Qdat2[Summest[i, 0], :] - Summest[i, namom:namom_end] except: w = np.ones((n_mom, n_mom)) * np.nan v = w diff2 = Summest[i, namom:namom_end] * np.nan try: jwj = np.linalg.inv(quad(jac, w)) #jwjj = np.dot(jwj,jac) jwjj = np.linalg.solve(quad(jac, w), jac) except: jwj = np.ones((4, 4)) * np.nan jwjj = np.ones((4, n_mom)) * np.nan brd = quad(w, v) jbrd = quad(jac, brd) avar = quad(jwj, jbrd) / n sd[i, :] = np.sqrt(np.diag(avar) * (1 + 1 / 10)) ub[i, :] = Mest[i, 0:npar_est] + 1.96 * sd[i, :] lb[i, :] = Mest[i, 0:npar_est] - 1.96 * sd[i, :] if jacworked[i] == 1: for j in range(0, 4): if (ub[i, j] > truepar[j]) and (lb[i, j] < truepar[j]): inb[jn, j] = 1.0 #bread = np.eye(n_mom) - np.dot(np.dot(jac.transpose(), jwjj), w) #cv = np.dot(np.dot(jac.transpose(), jwjj), np.dot(w, v)) cv = np.dot(np.dot(jac.transpose(), jwjj), np.dot(w, v)) vv = (v - cv - cv.transpose() + quad(np.dot(jac.transpose(), jwjj), quad(w, v))) try: wnew = np.linalg.pinv(vv) * n * 10 / 11 #wnew = np.linalg.pinv(quad(bread, v) * (1 + 1/10)) * n #wnew = np.linalg.pinv((v - cv*(1-1/10) - cv.transpose()*(1-1/10) + quad(np.dot(jac.transpose(), jwjj), quad(w, v))*(1+1/10))) * n except: wnew = np.ones((nimom_end - nimom, nimom_end - nimom)) * np.nan try: wnew2 = np.linalg.pinv(Varoos[i, :].reshape( namom_end - namom, namom_end - namom)) * n * 10 / 11 except: wnew2 = np.ones( (namom_end - namom, namom_end - namom)) * np.nan #print(quad(diff,wnew)) jstat[jn] = quad(diff, wnew) jstato[jn] = quad(diff2, wnew2) #if n_mom == 8: # if np.sum(jacs2[i,:]==0)>0: # jstato[jn] = np.nan #else: # if np.sum(jacs[i,:]==0)>0: # jstat[jn] = np.nan Diff2[jn, :] = diff vv = v - cv * (1 - 1 / 10) - cv.transpose() * (1 - 1 / 10) + quad( np.dot(jac.transpose(), jwjj), quad(w, v)) * (1 + 1 / 10) VV[jn, :] = (v - cv * (1 - 1 / 10) - cv.transpose() * (1 - 1 / 10) + quad(np.dot(jac.transpose(), jwjj), quad(w, v)) * (1 + 1 / 10)).reshape(1, n_mom * n_mom) V[jn, :] = wnew.reshape(1, n_mom * n_mom) VVV[jn, :] = v.reshape(1, n_mom * n_mom) VO[jn, :] = Varoos[i, :] Diffo[jn, :] = diff2 JWJ[jn, :] = jwj.reshape(1, 4 * 4) J2[jn, :] = jac2.reshape(n_mom2 * npar_est) J[jn, :] = jac.reshape(n_mom * npar_est) CV[jn, :] = np.dot(jwj, np.dot(jac, np.dot(w, v))).reshape(1, 4 * n_mom) try: ts[jn, :] = err[i, 0:npar_est] / sd[i, :] except: ts[jn, :] = np.nan tsn[jn, :] = diff / np.sqrt(np.diag(vv) / n) jn += 1 jstat_true = np.abs(Summest2[isfloat(Summest2[:, 2]), 2].astype(float) * n * (10 / 11)) prt = np.nanmean( np.abs(ts[np.prod(np.isnan(ts) == False, 1) == 1, :]) > 1.96, 0) dist = stats.chi2(n_mom - 4) dist2 = stats.chi2(n_mom) dist3 = stats.chi2(n_mom2) dnorm = stats.norm() pjstat = dist.cdf(jstat) pjstat2 = dist2.cdf(jstat_true) pjstato = dist3.cdf(jstato) if wdataname == 'ientity': Wdat = np.identity(n_mom) results = { 'mest': np.nanmean(Mest, 0), 'mestf': Mest, 'bias': bias, 'bias_pct': bias / truepar, 'mse': mse, 'rmse': rmse, 'rmse_pct': rmse / truepar, 'sd_pct': np.mean(sd[np.prod(np.isnan(tsn) == False, 1) == 1, :], 0) / truepar[0:npar_est], 'sd': sd, 'inb': inb, 'prt': prt, 'summ': Summest, 'jstat': jstat, 'jstat_true': jstat_true, 'pjstat': pjstat, 'pjstat2': pjstat2, 'err': err, 'ts': ts, 'tsn': tsn, 'jstato': jstato, 'pjstato': pjstato, 'w': Wdat, 'diff': Diff, 'diff2': Diff2, 'v': V, 'vv': VV, 'vvv': VVV, 'cv': CV, 'jwj': JWJ, 'ju1': J, 'ju2': J2, 'vfile': Vdat, 'vo': VO, 'diffo': Diffo, 'dat': Qdat } plt.figure(figsize=(20, 20)) try: plt.plot(dist.cdf(np.nanpercentile(jstat, np.arange(0, 101))), np.arange(0, 101) / 100.0, color='r', lw=4) 
plt.plot(dist2.cdf(np.nanpercentile(jstat_true, np.arange(0, 101))), np.arange(0, 101) / 100.0, 'b--', lw=4) except: print("No in sample moments") plt.plot(np.arange(0, 101) / 100.0, np.arange(0, 101) / 100.0, 'k', lw=2) try: plt.plot(dist3.cdf(np.nanpercentile(jstato, np.arange(0, 101))), np.arange(0, 101) / 100.0, 'g-.', lw=4) except: print("No out of sample moments") plt.xlabel(r'Theoretical percentile', fontsize=40) plt.ylabel(r'Actual percentile', fontsize=40) plt.legend([ r'Estimated parameters', r'True parameters', r'Theoretical', r'Out-of-sample' ], loc=2, frameon=False, fontsize=35) plt.savefig("../WR/chi2plot_" + filename + ".png") plt.close() try: plt.figure() for i in range(0, tsn.shape[1]): plt.plot(np.nanpercentile(dnorm.cdf(tsn[:, i]), np.arange(0, 101)), np.arange(0, 101) / 100.0) plt.plot(np.arange(0, 101) / 100.0, np.arange(0, 101) / 100.0, 'k--') plt.xlabel('Theoretical percentile') plt.ylabel('Actual percentile') plt.savefig("../WR/tplot_" + filename + ".png") plt.close() except: print("no plot") try: plt.figure() for i in range(0, ts.shape[1]): plt.plot(np.nanpercentile(dnorm.cdf(ts[:, i]), np.arange(0, 101)), np.arange(0, 101) / 100.0) plt.plot(np.arange(0, 101) / 100.0, np.arange(0, 101) / 100.0, 'k--') plt.xlabel('Theoretical percentile') plt.ylabel('Actual percentile') plt.savefig("../WR/tplot_par_" + filename + ".png") plt.close() except: print("no plot") return (results)
def _mvn_to_scipy(loc, cov, prec, tril):
    jax_dist = dist.MultivariateNormal(loc, cov, prec, tril)
    mean = jax_dist.mean
    cov = jax_dist.covariance_matrix
    return osp.multivariate_normal(mean=mean, cov=cov)


_DIST_MAP = {
    dist.BernoulliProbs: lambda probs: osp.bernoulli(p=probs),
    dist.BernoulliLogits: lambda logits: osp.bernoulli(p=_to_probs_bernoulli(logits)),
    dist.Beta: lambda con1, con0: osp.beta(con1, con0),
    dist.BinomialProbs: lambda probs, total_count: osp.binom(n=total_count, p=probs),
    dist.BinomialLogits: lambda logits, total_count: osp.binom(n=total_count, p=_to_probs_bernoulli(logits)),
    dist.Cauchy: lambda loc, scale: osp.cauchy(loc=loc, scale=scale),
    dist.Chi2: lambda df: osp.chi2(df),
    dist.Dirichlet: lambda conc: osp.dirichlet(conc),
    dist.Exponential: lambda rate: osp.expon(scale=np.reciprocal(rate)),
    dist.Gamma: lambda conc, rate: osp.gamma(conc, scale=1. / rate),
    dist.HalfCauchy: lambda scale: osp.halfcauchy(scale=scale),
    dist.HalfNormal: lambda scale: osp.halfnorm(scale=scale),
    dist.LogNormal: lambda loc, scale: osp.lognorm(s=scale, scale=np.exp(loc)),
    dist.MultinomialProbs: lambda probs, total_count: osp.multinomial(n=total_count, p=probs),
    dist.MultinomialLogits: lambda logits, total_count: osp.multinomial(n=total_count, p=_to_probs_multinom(logits)),
    dist.MultivariateNormal: _mvn_to_scipy,
    dist.Normal: lambda loc, scale: osp.norm(loc=loc, scale=scale),
    dist.Pareto: lambda alpha, scale: osp.pareto(alpha, scale=scale),
    dist.Poisson: lambda rate: osp.poisson(rate),
    dist.StudentT: lambda df, loc, scale: osp.t(df=df, loc=loc, scale=scale),
    dist.Uniform: lambda a, b: osp.uniform(a, b - a),
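# A sketch of how a map like _DIST_MAP is typically consumed (illustrative;
# the surrounding test harness is not shown here, and get_sp_dist is a
# hypothetical helper): look up the scipy counterpart of a distribution
# class, then compare samples against its cdf with a Kolmogorov-Smirnov test.
from scipy.stats import kstest

def get_sp_dist(jax_dist_cls, *params):
    # build the frozen scipy distribution matching the numpyro class
    return _DIST_MAP[jax_dist_cls](*params)

# e.g. sp_dist = get_sp_dist(dist.Chi2, 3.0)
#      assert kstest(samples, sp_dist.cdf).pvalue > 0.01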
# In[19]:

elements = np.array([1, 5, 12])
probabilities = [0.05, 0.7, 0.25]
np.random.choice(elements, 10, p=probabilities)


# # Other distributions

# There are many other standard families of distributions, and many of them
# can also be generated in Python.
# For example, the chi-squared distribution $\chi^2_k$, which has a natural
# parameter $k$ called the number of degrees of freedom:

# In[20]:

x = np.linspace(0, 30, 100)
for k in [1, 2, 3, 4, 6, 9]:
    rv = sts.chi2(k)
    cdf = rv.cdf(x)
    plt.plot(x, cdf, label="$k=%s$" % k)
plt.legend()
plt.title(r"CDF ($\chi^2_k$)")


# In[21]:

x = np.linspace(0, 30, 100)
for k in [1, 2, 3, 4, 6, 9]:
    rv = sts.chi2(k)
    pdf = rv.pdf(x)
    plt.plot(x, pdf, label="$k=%s$" % k)
plt.legend()
plt.title(r"PDF ($\chi^2_k$)")
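# In[22]:

# (An illustrative extra cell, not part of the original notebook.) We can
# also draw a sample from $\chi^2_k$ with rvs() and compare its histogram
# with the density:
k = 5
rv = sts.chi2(k)
sample = rv.rvs(1000)
plt.hist(sample, bins=30, density=True, alpha=0.5, label="sample")
plt.plot(x, rv.pdf(x), label="$k=%s$" % k)
plt.legend()
plt.title(r"Histogram vs PDF ($\chi^2_k$)")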
def ITGP(X, Y, alpha1=0.50, alpha2=0.975, nsh=2, ncc=2, nrw=1,
         maxiter=None, return_predict=True, callback=None, callback_args=(),
         warm_start=False, optimize_kwargs={}, **gp_kwargs):
    """
    Robust Gaussian Process Regression Based on Iterative Trimming.

    Parameters
    ----------
    X: array shape (n, d)
    Y: array shape (n, 1)
        Input data with shape (# of data, # of dims).
    alpha1, alpha2: float in (0, 1)
        Trimming and reweighting parameters respectively.
    nsh, ncc, nrw: int (>=0)
        Number of shrinking, concentrating, and reweighting iterations
        respectively.
    return_predict: bool
        If True, the predicted mean, variance, and score of the input data
        will be returned.
    callback: callable
        Function for monitoring the iteration process. It takes the iteration
        number i and the locals() dict as input, e.g.
            callback=lambda i, locals: print(i, locals['gp'].num_data, locals['gp'].param_array)
        or
            callback=lambda i, locals: locals['gp'].plot()
    callback_args:
        Extra parameters for callback.
    warm_start: bool, int
        The step from which warm starts are used when optimizing
        hyper-parameters.
        0: (default) disable warm start, always use a fresh initial guess
           (provided by the input gp object).
        >=1: for steps >= warm_start, start optimization from the
           hyper-parameters trained in the previous iteration.
        A warm start might help converge faster, with the risk of being
        trapped at a local solution.
    optimize_kwargs:
        GPy.core.GP.optimize parameters.
    **gp_kwargs:
        GPy.core.GP parameters, including likelihood and kernel.
        Gaussian and RBF are used as defaults.

    Returns
    -------
    ITGPResult: named tuple object
        gp: GPy.core.GP object.
        consistency: Consistency factor.
        ix_sub: Boolean index of the retained (trimmed) sample.
        niter: Total iterations performed, <= 1 + nsh + ncc + nrw.
        Y_avg, Y_var: Expectation and variance of input data points.
            None if return_predict=False.
        score: Scaled residuals. None if return_predict=False.
    """
    # check parameters
    if X.ndim == 1:
        X = np.atleast_2d(X).T
    if Y.ndim == 1:
        Y = np.atleast_2d(Y).T
    if len(X) != len(Y):
        raise ValueError("X should have the same length as Y")

    n, p = Y.shape
    if p != 1:
        raise ValueError("Y is expected in shape (n, 1).")
    if n * alpha1 - 0.5 <= 2:
        raise ValueError("The dataset is unreasonably small!")
    if nsh < 0 or ncc < 0 or nrw < 0:
        raise ValueError("nsh, ncc and nrw are expected to be nonnegative integers.")

    gp_kwargs.setdefault('likelihood', GPy.likelihoods.Gaussian(variance=1.0))
    gp_kwargs.setdefault('kernel', GPy.kern.RBF(X.shape[1]))
    gp_kwargs.setdefault('name', 'ITGP regression')

    # use copies so that the input likelihood and kernel will not be changed
    likelihood_init = gp_kwargs['likelihood'].copy()
    kernel_init = gp_kwargs['kernel'].copy()

    # temp vars declaration
    d_sq = None
    ix_old = None
    niter = 0

    # shrinking and concentrating
    for i in range(1 + nsh + ncc):
        if i == 0:
            # start with the full sample
            ix_sub = slice(None)
            consistency = 1.0
        else:
            # reduce alpha from 1 to alpha1 gradually
            if i <= nsh:
                alpha = alpha1 + (1 - alpha1) * (1 - i / (nsh + 1))
            else:
                alpha = alpha1
            chi_sq = chi2(p).ppf(alpha)
            h = int(min(np.ceil(n * alpha - 0.5), n - 1))  # alpha <= (h+0.5)/n

            # XXX: might be buggy when there are identical data points.
            # Better to use argpartition! But that may break ix_sub == ix_old.
            ix_sub = (d_sq <= np.partition(d_sq, h)[h])  # alpha-quantile
            consistency = alpha / chi2(p + 2).cdf(chi_sq)

            # check convergence
            if (i > nsh + 1) and (ix_sub == ix_old).all():
                break  # converged
            ix_old = ix_sub

        # warm start?
        if warm_start == 0 or niter < warm_start:
            gp_kwargs['likelihood'] = likelihood_init.copy()
            gp_kwargs['kernel'] = kernel_init.copy()

        # train GP
        gp = GPy.core.GP(X[ix_sub], Y[ix_sub], **gp_kwargs)
        gp.optimize(**optimize_kwargs)

        # make prediction
        Y_avg, Y_var = gp.predict(X, include_likelihood=True)
        d_sq = ((Y - Y_avg)**2 / Y_var).ravel()

        if callback is not None:
            callback(niter, locals(), *callback_args)
        niter += 1

    # reweighting
    for i in range(nrw):
        alpha = alpha2
        chi_sq = chi2(p).ppf(alpha)
        # XXX: might be buggy when there are identical data points
        ix_sub = (d_sq <= chi_sq * consistency)
        consistency = alpha / chi2(p + 2).cdf(chi_sq)

        # check convergence
        if (ix_sub == ix_old).all():
            break  # converged
        ix_old = ix_sub

        # warm start?
        if warm_start == 0 or niter < warm_start:
            gp_kwargs['likelihood'] = likelihood_init.copy()
            gp_kwargs['kernel'] = kernel_init.copy()

        # train GP
        gp = GPy.core.GP(X[ix_sub], Y[ix_sub], **gp_kwargs)
        gp.optimize(**optimize_kwargs)

        # make prediction
        if i < nrw - 1 or return_predict:
            Y_avg, Y_var = gp.predict(X, include_likelihood=True)
            d_sq = ((Y - Y_avg)**2 / Y_var).ravel()
        else:
            pass  # skip the final prediction unless it is wanted

        if callback is not None:
            callback(niter, locals(), *callback_args)
        niter += 1

    if return_predict:
        # outlier detection
        score = (d_sq / consistency)**0.5
        return ITGPResult(gp, consistency, ix_sub, niter, Y_avg, Y_var, score)
    else:
        return ITGPResult(gp, consistency, ix_sub, niter, None, None, None)
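# A minimal usage sketch for ITGP on synthetic data with injected outliers
# (illustrative only; assumes GPy is installed and ITGP/ITGPResult are
# defined as above):
import numpy as np

rng = np.random.RandomState(0)
X = np.linspace(0, 10, 200)[:, None]
Y = np.sin(X) + 0.1 * rng.randn(200, 1)
Y[rng.choice(200, 20, replace=False)] += 3.0   # contaminate 10% of the data

res = ITGP(X, Y, alpha1=0.5, alpha2=0.975, nsh=2, ncc=2, nrw=1)
print(res.niter, res.ix_sub.sum())             # iterations run, points kept
Y_hat, Y_hat_var = res.gp.predict(X)           # robust GP prediction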
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller


def normalnoisesim(nobs=500, loc=0.0):
    return loc + np.random.randn(nobs)


def lb(x):
    s, p = acorr_ljungbox(x, lags=4)
    return np.r_[s, p]


mc1 = StatTestMC(normalnoisesim, lb)
mc1.run(5000, statindices=lrange(4))

print(mc1.summary_quantiles([1, 2, 3], stats.chi2([2, 3, 4]).ppf,
                            varnames=['lag 1', 'lag 2', 'lag 3'],
                            title='acorr_ljungbox'))
print('\n\n')

frac = [0.01, 0.025, 0.05, 0.1, 0.975]
crit = stats.chi2([2, 3, 4]).ppf(np.atleast_2d(frac).T)
print(mc1.summary_cdf([1, 2, 3], frac, crit,
                      varnames=['lag 1', 'lag 2', 'lag 3'],
                      title='acorr_ljungbox'))
print(mc1.cdf(crit, [1, 2, 3])[1])

# ----------------------


def randwalksim(nobs=500, drift=0.0):
    return (drift + np.random.randn(nobs)).cumsum()
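# What the Monte Carlo above checks, in a self-contained form (illustrative):
# under white noise the Ljung-Box statistic with m lags is asymptotically
# chi2(m), so the rejection rate at the chi2 critical value should be close
# to the nominal level.
import numpy as np
from scipy import stats

def ljung_box(x, m):
    # Q = n(n+2) * sum_{k=1}^{m} acf_k^2 / (n - k)
    n = len(x)
    x = x - x.mean()
    acf = np.array([np.sum(x[k:] * x[:-k]) for k in range(1, m + 1)])
    acf /= np.sum(x * x)
    return n * (n + 2) * np.sum(acf**2 / (n - np.arange(1, m + 1)))

crit = stats.chi2(4).ppf(0.95)
rej = np.mean([ljung_box(np.random.randn(500), 4) > crit for _ in range(2000)])
print(rej)  # should be close to 0.05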
def test_Gamma_to_ChiSquare(self):
    X = RV(Gamma(shape=10 / 2, scale=2))
    sims = X.sim(Nsim)
    cdf = stats.chi2(df=10).cdf
    pval = stats.kstest(sims, cdf).pvalue
    self.assertTrue(pval > .01)
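# The identity behind the test above: chi2(k) is exactly Gamma(shape=k/2,
# scale=2). A quick standalone check with scipy (not part of the test suite):
import numpy as np
from scipy import stats

x = np.linspace(0.1, 30, 200)
assert np.allclose(stats.chi2(df=10).pdf(x), stats.gamma(a=5, scale=2).pdf(x))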
def lmm(self, data, phe, covars, cpgnames, logdelta, reml=True):
    """
    Returns output sorted by p-values:
        sorted_cpgnames, sorted_cpg_indices, p_vals, beta_est, sigma_e_est,
        sigma_g_est, statistics
    where beta_est is a 2d array in which beta_est[i] holds the coefficients
    of site i:
        beta_est[i][0] is the coefficient of the intercept
        beta_est[i][-1] is the coefficient of site i
        beta_est[i][1:-1] are the coefficients of the covariates
    """
    number_of_samples = phe.shape[0]

    # Prepare required matrices
    Uy = np.dot(self.U.T, phe).flatten()
    UX = self.U.T.dot(covars)
    Sd = self.s + np.exp(logdelta)
    UyS = Uy / Sd
    yKy = UyS.T.dot(Uy)
    logdetK = np.log(Sd).sum()

    num_of_non_zero_eigenvalues = len(Sd)
    num_of_zero_eigenvalues = number_of_samples - num_of_non_zero_eigenvalues
    logging.debug("Found %d zero eigenvalues." % num_of_zero_eigenvalues)

    # Compute null LL
    XX = covars.T.dot(covars)
    Sxx, Uxx = la.eigh(XX)
    logdetXX = np.log(Sxx).sum()
    null_ll, beta_0, null_F = lleval(Uy, UX, Sd, yKy, logdetK, logdetXX,
                                     reml=reml)
    logging.debug('null LL: %s.' % null_ll)

    # Add an extra column to UX that will hold UX for the tested site
    UX = np.concatenate((np.zeros((UX.shape[0], 1)), UX), axis=1)
    UX_all = self.U.T.dot(data)

    # Compute logdetXX - we assume it is the same for all sites because they
    # are standardized
    covars = np.concatenate((np.zeros((number_of_samples, 1)), covars), axis=1)
    covars[:, 0] = data[:, 0]
    XX = covars.T.dot(covars)
    Sxx, Uxx = la.eigh(XX)
    logdetXX = np.log(Sxx).sum()

    # Perform GWAS
    results = []
    for site_i, site_name in enumerate(cpgnames):
        UX[:, 0] = UX_all[:, site_i]
        # Note that the order of coefficients in beta is: site under test,
        # covariates, intercept
        ll, beta, F = lleval(Uy, UX, Sd, yKy, logdetK, logdetXX, reml=reml)

        # Calculate sigma_g, sigma_e
        sigma_g = np.sum([((Uy[i] - np.dot(UX[i, :], beta))**2) / Sd[i]
                          for i in range(num_of_non_zero_eigenvalues)])
        sigma_g += np.sum([((Uy[i] - np.dot(UX[i, :], beta))**2) / np.exp(logdelta)
                           for i in range(num_of_zero_eigenvalues)])
        if reml:
            sigma_g = (sigma_g / (number_of_samples - UX.shape[1]))**0.5
        else:
            sigma_g = (sigma_g / number_of_samples)**0.5
        sigma_e = (np.exp(logdelta) * (sigma_g**2))**0.5

        results.append((site_i, site_name, ll, F, beta, sigma_g, sigma_e))

    # Sort results and compute p-values
    if reml:
        results.sort(key=lambda t: t[3], reverse=True)
        fDist = stats.f(1, number_of_samples - 1)
        p_vals = fDist.sf([t[3] for t in results])
    else:
        results.sort(key=lambda t: t[2], reverse=True)
        chi2_dist = stats.chi2(1)
        p_vals = chi2_dist.sf(2 * (np.array([t[2] for t in results]) - null_ll))

    # sorted_cpg_indices[i] is the index of sorted_cpgnames[i] in cpgnames,
    # i.e. cpgnames[sorted_cpg_indices[i]] == sorted_cpgnames[i]
    sorted_cpg_indices = [res[0] for res in results]
    sorted_cpgnames = [res[1] for res in results]
    beta_est = [res[4] for res in results]
    sigma_g_est = [res[5] for res in results]
    sigma_e_est = [res[6] for res in results]
    if reml:
        statistics = [res[3] for res in results]
    else:
        statistics = [res[2] for res in results]

    return sorted_cpgnames, sorted_cpg_indices, p_vals, beta_est, \
        sigma_e_est, sigma_g_est, statistics
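# The ML branch above is a likelihood-ratio test: twice the log-likelihood
# gain over the null model is asymptotically chi2(1), because exactly one
# parameter (the tested site) is added. A minimal illustration with
# hypothetical log-likelihood values:
from scipy import stats

ll_null, ll_alt = -1234.5, -1230.1
lrt = 2 * (ll_alt - ll_null)
pval = stats.chi2(1).sf(lrt)   # same form as chi2.sf(2 * (ll - null_ll)) above
print(lrt, pval)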
def test_sum_Normal_to_ChiSquare(self):
    X, Y, Z, A, B = RV(Normal(mean=0, var=1)**5)
    sims = ((X**2) + (Y**2) + (Z**2) + (A**2) + (B**2)).sim(Nsim)
    cdf = stats.chi2(df=5).cdf
    pval = stats.kstest(sims, cdf).pvalue
    self.assertTrue(pval > .01)
import rvlib as rl
import scipy.stats as st
import numpy as np

# Get random points at which to evaluate the functions
np.random.seed(1234)
x = np.random.rand(10)

# Create normal distributions
N_rl = rl.Normal(0, 1)
N_st = st.norm(0, 1)

# Check normal cdfs against each other
N_rl_cdf = N_rl.cdf(x)
N_st_cdf = N_st.cdf(x)
np.allclose(N_rl_cdf, N_st_cdf)

# Create chi2 distributions
chi2_rl = rl.Chisq(5)
chi2_st = st.chi2(5)

# Check chi2 cdfs against each other
chi2_rl_cdf = chi2_rl.cdf(x)
chi2_st_cdf = chi2_st.cdf(x)
np.allclose(chi2_rl_cdf, chi2_st_cdf)
mean_revocation_fraction_of_discharges = {
    crime: mean(revocations[crime]) / mean(discharges[crime])
    for crime in crimes
}
mean_completion_duration = {
    crime: (1 - mean_revocation_fraction_of_discharges[crime])
    * mean(total_population[crime])
    / (mean(discharges[crime]) - mean(revocations[crime]))
    for crime in crimes
}

transitions_data = pd.DataFrame()
for crime in crimes:
    # populate transition data
    completion_pdf = chi2(mean_completion_duration[crime]).pdf
    probation_transition_table = pd.DataFrame({
        "compartment": ["probation"] * 100,
        "compartment_duration": [i + 1 for i in range(50)] * 2,
        "outflow_to": ["release"] * 50 + ["prison"] * 50,
        "total_population": [completion_pdf(i + 1) for i in range(50)]
        + [completion_pdf(i + 1) * mean_revocation_fraction_of_discharges[crime]
           for i in range(50)],
        "crime_type": [crime] * 100,
    })
    secondary_transition_table = pd.DataFrame({
        "compartment": ["release", "prison"],
        "compartment_duration": [1, 1],
        "outflow_to": ["release", "prison"],
        "total_population": [1, 1],
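# Why chi2 plausibly works as a duration curve here (illustrative): a chi2
# with df = d has mean d, so setting the degrees of freedom to the mean
# completion duration gives a right-skewed curve centered on that mean.
# A quick check with a hypothetical mean duration:
import numpy as np
from scipy.stats import chi2

mean_duration = 7.5                  # hypothetical mean completion duration
print(chi2(mean_duration).mean())    # equals mean_duration
weights = np.array([chi2(mean_duration).pdf(i + 1) for i in range(50)])
print(weights / weights.sum())       # normalized discrete duration weights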
def chatterjeeMachlerHadi(X, y, **kwargs):
    # basic info
    options = parseKeywords(kwargs)

    # for the distances, use absX - do this before adding the intercept term
    # (a column of all ones would cause problems with non full rank
    # covariance matrices)
    absX = np.absolute(X)

    # now calculate p and n
    n = absX.shape[0]
    p = absX.shape[1]

    # we treat the X matrix as a multivariate matrix with n observations and
    # p variables; first find a basic subset free of outliers
    correctionFactor = 1 + (1.0 * (p + 1) / (n - p)) + (2.0 / (n - 1 - 3 * p))
    chi = stats.chi2(p, 0)
    alpha = 0.05
    # upper alpha/n quantile of chi2(p) as the outlier cut-off
    chi2bound = correctionFactor * chi.ppf(1 - alpha / n)

    # calculate h, the size of the first basic subset
    # note that this is the value h; the index of the h-th element is h-1
    h = int(1.0 * (n + p + 1) / 2)  # only want the integer part of this

    # the coordinatewise medians - the median of each column
    medians = np.median(absX, axis=0)

    # now compute the matrix used to calculate the distance
    A = np.zeros(shape=(p, p))
    for i in range(0, n):
        tmp = absX[i, :] - medians
        A += np.outer(tmp, tmp)
    A = 1.0 / (n - 1) * A

    # now calculate initial distances
    dInit = calculateDistCMH(n, absX, medians, A)

    # now get the h smallest values of d
    sortOrder = np.argsort(dInit)
    indices = sortOrder[0:h]
    means = np.average(absX[indices, :], axis=0)
    # observations in rows, columns are variables
    covariance = np.cov(absX[indices], rowvar=False)
    dH = calculateDistCMH(n, absX, means, covariance)

    # rearrange the n observations into order and partition into two initial
    # subsets: one of size p+1, the other n-p-1
    sortOrder = np.argsort(dH)
    indicesBasic = sortOrder[:p + 1]
    # there is a rank issue here, but ignore for now - natural observations
    # will presumably be full rank
    means = np.average(absX[indicesBasic, :], axis=0)
    covariance = np.cov(absX[indicesBasic], rowvar=False)
    dist = calculateDistCMH(n, absX, means, covariance)

    # create the basic subset
    r = p + 2
    increment = max((h - r) // 100, 1)  # limiting to ~100 iterations of this
    while r <= h:
        sortOrder = np.argsort(dist)
        indices = sortOrder[:r]
        means = np.average(absX[indices], axis=0)
        covariance = np.cov(absX[indices], rowvar=False)
        dist = calculateDistCMH(n, absX, means, covariance)
        if 0 < h - r < increment:
            r = h
        else:
            r += increment

    # second part: add more points and exclude outliers from the basic set;
    # all squared distances above the bound are outliers
    while r < n:
        sortOrder = np.argsort(dist)
        dist2 = np.power(dist, 2)
        if dist2[sortOrder[r]] > chi2bound:
            # everything else is an outlier - it would be good if this
            # could be saved somehow
            break
        # otherwise, continue adding points
        indices = sortOrder[:r]
        means = np.average(absX[indices], axis=0)
        covariance = np.cov(absX[indices], rowvar=False)
        dist = calculateDistCMH(n, absX, means, covariance)
        if 0 < n - 1 - r < increment:
            r = n - 1
        else:
            r += increment

    # with the Hadi distances calculated, proceed to the robust regression
    # normalise and manipulate the Hadi distances
    dist = dist / np.max(dist)
    # the paper suggests using the median of the complete set of distances
    distMedian = np.median(dist)
    tmp = np.maximum(dist, np.ones(shape=(n)) * distMedian)
    dist = np.reciprocal(tmp)
    dist2 = np.power(dist, 2)
    dist = dist2 / np.sum(dist2)
    # the first set of weights is simply dist
    weights = dist

    # now add the additional constant intercept column if required
    if options["intercept"]:
        # add a column of ones for the constant term
        X = np.hstack((np.ones(shape=(X.shape[0], 1), dtype="complex"), X))
        n = X.shape[0]
        p = X.shape[1]

    # iteratively reweighted least squares
    iteration = 0
    while iteration < options["maxiter"]:
        # do the weighted least-squares
        Anew, ynew = weightLS(X, y, weights)
        paramsNew, squareResidNew, rankNew, sNew = linalg.lstsq(Anew, ynew)
        residsNew = y - np.dot(X, paramsNew)

        # check residsNew to make sure it is not all zeros (as will happen
        # in an under- or exactly determined system)
        if np.sum(np.absolute(residsNew)) < eps():
            # then return everything here
            return paramsNew, residsNew, weights

        residsAbs = np.absolute(residsNew)
        residsSquare = np.power(residsAbs, 2)
        residsNew = residsSquare / np.sum(residsSquare)
        residsMedian = np.median(residsAbs)

        # calculate the new weights
        tmpDenom = np.maximum(residsNew,
                              np.ones(shape=(n), dtype="float") * residsMedian)
        tmp = (1 - dist) / tmpDenom
        weightsNew = np.power(tmp, 2) / np.sum(np.power(tmp, 2))

        # increment iteration
        iteration = iteration + 1
        weights = weightsNew
        params = paramsNew

        if iteration > 1:
            # check whether the change is smaller than the tolerance
            changeResids = linalg.norm(residsNew - resids) / linalg.norm(residsNew)
            if changeResids < eps():
                # update resids
                resids = residsNew
                break
        # update resids
        resids = residsNew

    # at the end, return the components
    return params, resids, weights
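# The helper calculateDistCMH is called above but not shown in this excerpt.
# A minimal sketch of what it presumably computes - the Mahalanobis-type
# distance of each observation from the current center under the current
# covariance (the actual implementation may differ):
import numpy as np

def calculateDistCMH(n, x, mean, covariance):
    # d_i = sqrt((x_i - mean)' C^{-1} (x_i - mean)) for each of the n rows
    inverted = np.linalg.inv(covariance)
    dist = np.empty(n)
    for i in range(n):
        diff = x[i, :] - mean
        dist[i] = np.sqrt(diff.dot(inverted).dot(diff))
    return dist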
G[:N * K + N, :N * K + N] = kron(eye(N), SigmaX)
G[N * K + N:, N * K + N:] = -beta @ beta.T
for i in range(N):
    temp = zeros((K, K + 1))
    values = mean(u[:, i]) - multiply(all_coef[:, i], riskPremia)  # beta[:, i]
    temp[:, 1:] = diag(values)
    G[N * K + N:, i * (K + 1):(i + 1) * (K + 1)] = temp

# matrix products, not elementwise multiplication
vcv = inv(G.T) @ S @ inv(G) / T

vcvAlpha = vcv[0:N * K + N:4, 0:N * K + N:4]
J = alpha @ inv(vcvAlpha) @ alpha.T
J = J[0, 0]
Jpval = 1 - chi2(25).cdf(J)

vcvRiskPremia = vcv[N * K + N:, N * K + N:]
annualizedRP = 12 * riskPremia
arp = list(squeeze(annualizedRP))
arpSE = list(sqrt(12 * diag(vcvRiskPremia)))

print('        Annualized Risk Premia')
print('           Market       SMB        HML')
print('--------------------------------------')
print('Premia     {0:0.4f}    {1:0.4f}     {2:0.4f}'.format(arp[0], arp[1], arp[2]))
print('Std. Err.  {0:0.4f}    {1:0.4f}     {2:0.4f}'.format(arpSE[0], arpSE[1], arpSE[2]))
print('\n\n')
print('J-test:    {:0.4f}'.format(J))
print('P-value:   {:0.4f}'.format(Jpval))
acceptance = kernels.evaluate_acceptance(values)
logging.info('obtained %d posterior samples with acceptance %.3f',
             args.num_samples, acceptance)
logging.info('posterior mean: %s', dict(zip(feature_names, np.mean(xs, axis=0))))
logging.info('posterior std: %s', dict(zip(feature_names, np.std(xs, axis=0))))
if 'theta' in data:
    logging.info('true values: %s', dict(zip(feature_names, data['theta'])))
    residuals = np.mean(xs, axis=0) - data['theta']
    logging.info('z-scores: %s',
                 dict(zip(feature_names, residuals / np.std(xs, axis=0))))
    cov_ = np.cov(xs.T)
    chi2 = residuals.dot(np.linalg.inv(cov_)).dot(residuals)
    pval = 1 - stats.chi2(len(cov_)).cdf(chi2)
    logging.info('chi2 for %d dof: %f; p-val: %f', len(cov_), chi2, pval)

# Package the data and results and save them ----------------------------------
os.makedirs(os.path.dirname(filename), exist_ok=True)
with atomic_write(filename, mode='wb', overwrite=True) as fp:
    pickle.dump(
        {
            'arghash': arghash,
            'args': config,
            'data': data,
            'result': result,
            'samples': {
                'xs': xs,
                'values': values,
            },