def _ks_2samp(self, data1, data2):
    import numpy as np
    from scipy.stats import kstwobign
    data1, data2 = map(np.asarray, (data1, data2))  # coerce inputs to arrays
    n1 = len(data1)  # number of elements in data1
    n2 = len(data2)  # number of elements in data2
    data1 = np.sort(data1)  # searchsorted below requires sorted inputs
    data2 = np.sort(data2)
    # Pool both samples: every point at which either empirical CDF can jump.
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / n1
    cdf2 = np.searchsorted(data2, data_all, side='right') / n2
    d = np.max(np.absolute(cdf1 - cdf2))  # note: d is the absolute, not signed, distance
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        prob = kstwobign.sf((en + 0.12 + 0.11 / en) * d)
    except Exception:
        prob = 1.0
    return prob
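# Self-contained sanity-check sketch (an addition, not from the original
# snippet): the Stephens-corrected asymptotic p-value used above should
# roughly agree with scipy.stats.ks_2samp on large unweighted samples.
import numpy as np
from scipy.stats import ks_2samp, kstwobign

rng = np.random.default_rng(42)
a = np.sort(rng.normal(size=2000))
b = np.sort(rng.normal(0.05, 1.0, size=1500))
pooled = np.concatenate([a, b])
d = np.max(np.abs(np.searchsorted(a, pooled, side='right') / len(a)
                  - np.searchsorted(b, pooled, side='right') / len(b)))
en = np.sqrt(len(a) * len(b) / (len(a) + len(b)))
print(kstwobign.sf((en + 0.12 + 0.11 / en) * d))  # asymptotic approximation
print(ks_2samp(a, b).pvalue)                      # scipy reference value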
def ks2d2s(x1, y1, x2, y2, nboot=None, extra=False):
    '''Two-dimensional Kolmogorov-Smirnov test on two samples.

    Parameters
    ----------
    x1, y1 : ndarray, shape (n1, )
        Data of sample 1.
    x2, y2 : ndarray, shape (n2, )
        Data of sample 2. Size of two samples can be different.
    nboot : int or None, optional
        Number of bootstrap resamples used to estimate the p-value. If None
        (default), the analytic approximation of Press (2007) is used instead.
    extra : bool, optional
        If True, KS statistic is also returned. Default is False.

    Returns
    -------
    p : float
        Two-tailed p-value.
    D : float, optional
        KS statistic. Returned if keyword `extra` is True.

    Notes
    -----
    This is the two-sided K-S test. Small p-values mean that the two samples
    are significantly different. Note that the p-value is only an
    approximation, as the analytic distribution is unknown. The approximation
    is accurate enough when N > ~20 and p-value < ~0.20 or so. When p-value >
    0.20, the value may not be accurate, but it certainly implies that the
    two samples are not significantly different. (cf. Press 2007)

    References
    ----------
    Peacock, J.A. 1983, Two-Dimensional Goodness-of-Fit Testing in Astronomy,
    Monthly Notices of the Royal Astronomical Society, vol. 202, pp. 615-627
    Fasano, G. and Franceschini, A. 1987, A Multidimensional Version of the
    Kolmogorov-Smirnov Test, Monthly Notices of the Royal Astronomical
    Society, vol. 225, pp. 155-170
    Press, W.H. et al. 2007, Numerical Recipes, section 14.8
    '''
    assert (len(x1) == len(y1)) and (len(x2) == len(y2))
    n1, n2 = len(x1), len(x2)
    D = avgmaxdist(x1, y1, x2, y2)

    if nboot is None:
        sqen = np.sqrt(n1 * n2 / (n1 + n2))
        r1 = pearsonr(x1, y1)[0]
        r2 = pearsonr(x2, y2)[0]
        r = np.sqrt(1 - 0.5 * (r1**2 + r2**2))
        d = D * sqen / (1 + r * (0.25 - 0.75 / sqen))
        p = kstwobign.sf(d)
    else:
        n = n1 + n2
        x = np.concatenate([x1, x2])
        y = np.concatenate([y1, y2])
        d = np.empty(nboot, 'f')
        for i in range(nboot):
            # Resample the pooled sample and split it back into two groups of
            # the original sizes (permutation-style bootstrap).
            idx = random.choice(n, n, replace=True)
            ix1, ix2 = idx[:n1], idx[n1:]
            d[i] = avgmaxdist(x[ix1], y[ix1], x[ix2], y[ix2])
        p = np.sum(d > D).astype('f') / nboot
    if extra:
        return p, D
    else:
        return p
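# Hedged usage sketch (an addition): assumes the module-level names this
# function relies on are in scope -- numpy as np, numpy.random as random,
# scipy.stats.pearsonr and kstwobign, and the avgmaxdist helper that
# computes the 2D KS distance of Fasano & Franceschini.
rng = np.random.RandomState(0)
x1, y1 = rng.standard_normal(500), rng.standard_normal(500)
x2, y2 = rng.standard_normal(400) + 0.2, rng.standard_normal(400)
p, D = ks2d2s(x1, y1, x2, y2, extra=True)    # analytic Press (2007) approximation
p_boot = ks2d2s(x1, y1, x2, y2, nboot=1000)  # bootstrap estimate of the p-value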
def Prob(self):
    # KS probability function: converts the statistic into a p-value via the
    # asymptotic Kolmogorov distribution. Note that `en` (effective sample
    # size factor) and `d` (KS statistic) are not defined in this method; as
    # written they must be supplied by the enclosing scope.
    try:
        from scipy.stats import kstwobign
        prob = kstwobign.sf((en + .12 + .11 / en) * d)
    except Exception:
        prob = 1.0
    self.prob = prob
def multi_ks2samp(X, Y, alphas, gamma=None):
    # Two-sample KS test in higher dimensions using QOCSVM quantile sets.
    m, n = X.shape[0], Y.shape[0]
    quants = QOCSVM(alphas, gamma=gamma)
    quants.fit(X)
    result1 = quants.transform(X)
    result2 = quants.transform(Y)
    F1 = result1.mean(axis=0)  # empirical coverage of each quantile set under X
    F2 = result2.mean(axis=0)  # empirical coverage of each quantile set under Y
    max_delta = np.max(np.abs(F1 - F2))
    teststat = np.sqrt((n * m) / (n + m)) * max_delta
    pval = kstwobign.sf(teststat)
    return pval, teststat, max_delta
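# Hypothetical call sketch (an addition): QOCSVM comes from this snippet's
# own module and its API beyond fit/transform is not shown here, so the
# alphas values below are purely illustrative quantile levels.
#
# alphas = np.linspace(0.1, 0.9, 9)
# pval, stat, delta = multi_ks2samp(X_sample, Y_sample, alphas, gamma=0.5)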
def ks_2samp(a, b, wa=None, wb=None):
    '''
    Compute the Kolmogorov-Smirnov statistic on 2 samples. This is a
    two-sided test for the null hypothesis that 2 independent samples are
    drawn from the same continuous distribution. Weights for each sample are
    accepted. If no weights are provided, then the function
    :func:`scipy.stats.ks_2samp` is called instead.

    :param a: first sample.
    :type a: numpy.ndarray
    :param b: second sample.
    :type b: numpy.ndarray
    :param wa: set of weights for "a". Same length as "a".
    :type wa: numpy.ndarray or None.
    :param wb: set of weights for "b". Same length as "b".
    :type wb: numpy.ndarray or None.
    :returns: Test statistic and two-tailed p-value.
    :rtype: float, float
    '''
    if wa is None and wb is None:
        return scipy_ks_2samp(a, b)

    a, cwa, na = _ks_2samp_values(a, wa)
    b, cwb, nb = _ks_2samp_values(b, wb)

    m = np.concatenate([a, b])

    cdfa = cwa[np.searchsorted(a, m, side='right')]
    cdfb = cwb[np.searchsorted(b, m, side='right')]

    d = np.max(np.abs(cdfa - cdfb))
    en = np.sqrt(na * nb / float(na + nb))

    try:
        prob = kstwobign.sf((en + 0.12 + 0.11 / en) * d)
    except Exception:
        prob = 1.
    return d, prob
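# The helper _ks_2samp_values is not included in this snippet. A hypothetical
# reconstruction consistent with how it is used above (it must return the
# sorted sample, its cumulative normalized weights prefixed with 0 so the
# searchsorted indices line up, and the total weight as the sample size):
def _ks_2samp_values_sketch(x, w):
    if w is None:
        w = np.ones(len(x), dtype=float)  # unit weights by default
    x, w = np.asarray(x, dtype=float), np.asarray(w, dtype=float)
    idx = np.argsort(x)
    x, w = x[idx], w[idx]
    cw = np.hstack([0., np.cumsum(w) / np.sum(w)])
    return x, cw, np.sum(w)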
def ks_2samp_w(data1, data2, weights1, weights2):
    """
    Reimplementation of ks_2samp from scipy/stats that allows weighted
    samples. From:
    https://stackoverflow.com/questions/40044375/how-to-calculate-the-kolmogorov-smirnov-statistic-between-two-weighted-samples
    NOT QUITE SURE IT WORKS. HELP IS WELCOME
    """
    from scipy.stats import kstwobign
    n1 = np.sum(weights1)
    n2 = np.sum(weights2)
    # Sort both samples, carrying the weights along.
    ix1 = np.argsort(data1)
    ix2 = np.argsort(data2)
    data1, weights1 = data1[ix1], weights1[ix1]
    data2, weights2 = data2[ix2], weights2[ix2]
    data_all = np.concatenate([data1, data2])
    # Weighted ECDFs, prepended with 0 so the searchsorted indices line up.
    cwei1 = np.hstack([0, np.cumsum(weights1) / np.sum(weights1)])
    cwei2 = np.hstack([0, np.cumsum(weights2) / np.sum(weights2)])
    cdf1we = cwei1[np.searchsorted(data1, data_all, side='right')]
    cdf2we = cwei2[np.searchsorted(data2, data_all, side='right')]
    d = np.max(np.absolute(cdf1we - cdf2we))  # note: d is the absolute, not signed, distance
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Stephens (1970) finite-sample correction, as used in Numerical Recipes.
        prob = kstwobign.sf((en + 0.12 + 0.11 / en) * d)
    except Exception:
        prob = 1.0
    return d, prob
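# Self-contained sanity check (an addition; assumes numpy as np is imported,
# as ks_2samp_w requires): with unit weights the weighted statistic should
# reproduce the unweighted one from scipy.stats.ks_2samp.
import numpy as np
from scipy.stats import ks_2samp as _scipy_ks_2samp

rng = np.random.default_rng(0)
x = rng.normal(size=300)
y = rng.normal(0.3, 1.0, size=200)
d_w, _ = ks_2samp_w(x, y, np.ones_like(x), np.ones_like(y))
d_u, _ = _scipy_ks_2samp(x, y)
assert np.isclose(d_w, d_u)  # statistics match exactly for unit weights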
def insertion_p_value(indexes, nlive, batch=0):
    """Compute the p-value from insertion indexes, assuming constant nlive.

    Note that this function doesn't use scipy.stats.kstest as the latter
    assumes continuous distributions.

    For more detail, see https://arxiv.org/abs/2006.03371

    For a rolling test, you should provide the optional parameter batch!=0.
    In this case the test computes the p-value on consecutive batches of
    size nlive * batch, selects the smallest one and adjusts for multiple
    comparisons using a Bonferroni correction.

    Parameters
    ----------
    indexes: array-like
        list of insertion indexes, sorted by death contour

    nlive: int
        number of live points

    batch: float
        batch size in units of nlive for a rolling p-value

    Returns
    -------
    ks_result: dict
        Kolmogorov-Smirnov test results:
            D: Kolmogorov-Smirnov statistic
            sample_size: sample size
            p-value: p-value
            # if batch != 0
            iterations: bounds of batch with minimum p-value
            nbatches: the number of batches in total
            uncorrected p-value: p-value without Bonferroni correction
    """
    if batch == 0:
        bins = np.arange(-0.5, nlive + 0.5, 1.)
        empirical_pmf = np.histogram(indexes, bins=bins, density=True)[0]
        empirical_cmf = np.cumsum(empirical_pmf)
        uniform_cmf = np.arange(1., nlive + 1., 1.) / nlive

        D = abs(empirical_cmf - uniform_cmf).max()
        sample_size = len(indexes)
        K = D * np.sqrt(sample_size)

        ks_result = {}
        ks_result["D"] = D
        ks_result["sample_size"] = sample_size
        ks_result["p-value"] = kstwobign.sf(K)
        return ks_result
    else:
        batch = int(batch * nlive)
        batches = [indexes[i:i + batch] for i in range(0, len(indexes), batch)]
        ks_results = [insertion_p_value(c, nlive) for c in batches]
        ks_result = min(ks_results, key=lambda t: t["p-value"])
        index = ks_results.index(ks_result)

        ks_result["iterations"] = (index * batch, (index + 1) * batch)
        ks_result["nbatches"] = n = len(batches)
        ks_result["uncorrected p-value"] = p = ks_result["p-value"]
        ks_result["p-value"] = 1. - (1. - p)**n
        if ks_result["p-value"] == 0.:
            ks_result["p-value"] = p * n
        return ks_result
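# Hedged usage sketch (an addition; assumes numpy as np and
# scipy.stats.kstwobign are imported at module level, as insertion_p_value
# requires): for insertion indexes drawn uniformly from range(nlive), the
# test should typically not reject.
import numpy as np

nlive = 500
rng = np.random.default_rng(1)
indexes = rng.integers(0, nlive, size=20 * nlive)
print(insertion_p_value(indexes, nlive)["p-value"])           # full-run p-value
print(insertion_p_value(indexes, nlive, batch=1)["p-value"])  # rolling, Bonferroni-corrected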
def ks_2samp(data1, data2):
    """
    Computes the Kolmogorov-Smirnov statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        two arrays of sample observations assumed to be drawn from a
        continuous distribution, sample sizes can be different

    Returns
    -------
    D : float
        KS statistic, signed: negative when the largest deviation occurs
        where cdf1 lies below cdf2
    p-value : float
        two-tailed p-value
    tau : float
        value at which the maximum deviation between the two empirical
        CDFs is attained

    Notes
    -----
    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample K-S test, the distribution is
    assumed to be continuous.

    This is the two-sided test, one-sided tests are not implemented. The test
    uses the two-sided asymptotic Kolmogorov-Smirnov distribution.

    If the K-S statistic is small or the p-value is high, then we cannot
    reject the hypothesis that the distributions of the two samples are the
    same.

    Examples
    --------
    >>> from scipy import stats
    >>> np.random.seed(12345678)  # fix random seed to get the same result
    >>> n1 = 200  # size of first sample
    >>> n2 = 300  # size of second sample

    For a different distribution, we can reject the null hypothesis since the
    pvalue is below 1%:

    >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
    >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    >>> stats.ks_2samp(rvs1, rvs2)
    (0.20833333333333337, 4.6674975515806989e-005)

    For a slightly different distribution, we cannot reject the null
    hypothesis at a 10% or lower alpha since the p-value at 0.144 is higher
    than 10%

    >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
    >>> stats.ks_2samp(rvs1, rvs3)
    (0.10333333333333333, 0.14498781825751686)

    For an identical distribution, we cannot reject the null hypothesis since
    the p-value is high, 41%:

    >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
    >>> stats.ks_2samp(rvs1, rvs4)
    (0.07999999999999996, 0.41126949729859719)
    """
    data1, data2 = map(np.asarray, (data1, data2))
    n1 = len(data1)
    n2 = len(data2)
    data1 = np.sort(data1)
    data2 = np.sort(data2)
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0 * n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0 * n2)
    darray = cdf1 - cdf2
    d = np.max(np.absolute(darray))
    # Keep the sign of the largest deviation and locate the value (tau) at
    # which it is attained.
    if d == -np.min(darray):
        d = -d
        jamfri = np.min(np.where(darray == np.min(darray))[0])
    else:
        jamfri = np.min(np.where(darray == darray.max())[0])
    tau = data_all[jamfri]
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Use the absolute statistic here: passing a signed (negative) d to
        # kstwobign.sf would always yield a p-value of ~1.
        prob = kstwobign.sf((en + 0.12 + 0.11 / en) * abs(d))
    except Exception:
        prob = 1.0
    return d, prob, tau
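# Hedged usage sketch (an addition; assumes numpy as np and
# scipy.stats.kstwobign are in scope, as the function requires): the third
# return value tau locates where the two empirical CDFs are farthest apart,
# and the sign of d records which ECDF is on top there.
import numpy as np

rng = np.random.default_rng(3)
d, p, tau = ks_2samp(rng.normal(size=400), rng.normal(0.5, 1.0, size=400))
print(d, p, tau)  # sign of d tells which ECDF dominates at the maximizing point tau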
def ks_2samp(a, b, aw=None, bw=None):
    """
    Computes the Kolmogorov-Smirnov (KS) statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution.

    Parameters
    ----------
    a, b : Sequence of 1D ndarrays
        Two arrays of sample observations assumed to be drawn from a
        continuous distribution, sample sizes can be different.
    aw, bw : Sequence of 1D ndarrays, optional
        The weights of each observation in a, b. Must be the same length as
        the associated array of observations. If omitted or None, every
        measurement will be assigned an equal weight.

    Returns
    -------
    D : float
        KS statistic
    p-value : float
        Two-tailed p-value

    Notes
    -----
    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample KS test, the distribution is
    assumed to be continuous.

    This is the two-sided test, one-sided tests are not implemented. The test
    uses the two-sided asymptotic KS distribution.

    If the KS statistic is small or the p-value is high, then we cannot
    reject the hypothesis that the distributions of the two samples are the
    same.

    This function accounts for weights using the recommendations found in
    [1]. Convergence is improved in the large-sample KS distribution by using
    the form found by [2].

    References
    ----------
    [1] J. Monahan, "Numerical Methods of Statistics", 2nd Ed., 2011
    [2] M. A. Stephens, "Use of the Kolmogorov-Smirnov, Cramer-Von Mises and
        Related Statistics Without Extensive Tables", Journal of the Royal
        Statistical Society, Series B (Methodological), Vol. 32, No. 1,
        pp. 115-122, 1970
    """
    # Methodology for the weighted Kolmogorov-Smirnov test taken from
    # Numerical Methods of Statistics - J. Monahan
    ab = np.sort(np.concatenate((a, b)))
    D = np.max(np.abs(ecdf(a, aw)(ab) - ecdf(b, bw)(ab)))
    n1 = len(a) if aw is None else np.sum(aw)**2 / np.sum(aw**2)
    n2 = len(b) if bw is None else np.sum(bw)**2 / np.sum(bw**2)
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    p = kstwobign.sf((en + 0.12 + 0.11 / en) * D)  # Stephens (1970)
    return D, p
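# Illustrative check (an addition): the weighted sample sizes above are
# Kish's effective sample size, n_eff = (sum w)^2 / sum(w^2), which reduces
# to the raw count when all weights are equal.
import numpy as np

w = np.full(10, 2.5)
assert np.isclose(np.sum(w)**2 / np.sum(w**2), len(w))  # equal weights -> n_eff == n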
def calibrateModelDRO(self, sigma_n_grid, batch_size=16, parallel=True):
    n1 = self.data_feature.shape[0]
    data_feature_sorted = self.data_feature.copy()
    data_feature_sorted.sort(axis=0)
    empirical_quantile = np.array(
        [np.arange(0., 1 - 1e-15, 1 / n1)] * self.encode_length).T
    # self.plotFeatureDistribution(data_feature_sorted)
    scaler = MinMaxScaler()
    simulate_quantile_list = []
    for sigma_n in sigma_n_grid:
        time_start = time.time()
        mid_prices_dist = self.generateTimeSeriesDistribution(
            sigma_n, batch_size, parallel)
        mid_prices_dist = np.array([
            scaler.fit_transform(mid_prices)
            for mid_prices in mid_prices_dist
        ])
        feature_dist = self.encode(mid_prices_dist[:, :, 0])
        feature_dist.sort(axis=0)
        n2 = len(feature_dist)
        simulate_quantile = np.full_like(empirical_quantile, 0.)
        for i in range(self.encode_length):
            simulate_quantile[:, i] = np.searchsorted(
                feature_dist[:, i], data_feature_sorted[:, i],
                side='right') / n2
        simulate_quantile_list.append(simulate_quantile)
        loss = np.max(np.abs(empirical_quantile - simulate_quantile))
        # argmax over the flattened array; recover the 2D position below.
        loss_argmax = np.argmax(np.abs(empirical_quantile - simulate_quantile))
        arg_x, arg_y = (loss_argmax // self.encode_length,
                        loss_argmax % self.encode_length)
        time_end = time.time()
        print("sigma_n {} finished with time {}, loss is {} at {}.".format(
            sigma_n, time_end - time_start, loss, (arg_x, arg_y)))
        logprint("sigma_n {} finished with time {}, loss is {} at {}.\n".format(
            sigma_n, time_end - time_start, loss, (arg_x, arg_y)))
        # self.plotFeatureDistribution(feature_dist)
        # self.plotQuantileHeatmap(empirical_quantile, simulate_quantile)
    simulate_quantile_list = np.array(simulate_quantile_list)
    print("Quantile calculation finished. Start optimization.")
    logprint("Quantile calculation finished. Start optimization.\n")
    m = gurobipy.Model("DRO")
    q = m.addVar(vtype=GRB.CONTINUOUS, name='q')
    W = dict()
    W_sum = 0
    quantile_avg = dict()
    m.addConstr(q >= 0, name="positive_q")
    for i in range(len(sigma_n_grid)):
        W[i] = m.addVar(vtype=GRB.CONTINUOUS, name='W' + str(i))
        m.addConstr(W[i] >= 0, "positive_W" + str(i))
        W_sum += W[i]
        for j in range(n1):
            for k in range(self.encode_length):
                if (j, k) in quantile_avg:
                    quantile_avg[(j, k)] += W[i] * simulate_quantile_list[i][j][k]
                else:
                    quantile_avg[(j, k)] = W[i] * simulate_quantile_list[i][j][k]
    m.addConstr(W_sum == 1, name="sum_prob")
    # Note: n2 here is the sample size of the last simulated distribution;
    # all grid points share batch_size, so the sizes are assumed equal.
    for j in range(n1):
        for k in range(self.encode_length):
            m.addConstr(
                empirical_quantile[j][k] - q / np.sqrt(n1 * n2 / (n1 + n2))
                <= quantile_avg[(j, k)],
                name="qCons1_" + str(j) + str(k))
            m.addConstr(
                empirical_quantile[j][k] + q / np.sqrt(n1 * n2 / (n1 + n2))
                >= quantile_avg[(j, k)],
                name="qCons2_" + str(j) + str(k))
    m.setObjective(q, GRB.MINIMIZE)
    m.optimize()
    print("Optimization finished.")
    logprint("Optimization finished.\n")
    W_optimal = [W[i].x for i in W]
    plt.plot(sigma_n_grid, W_optimal)
    plt.xlabel(r"$\sigma_n^2$")
    plt.ylabel("Weights")
    plt.savefig("weights_l_{}_b_{}.png".format(self.lambda_a, batch_size))
    plt.show()
    return m.objVal, kstwobign.sf(m.objVal), W_optimal
def calibrateModelDRO(self, sigma_n_grid, batch_size=16, parallel=True):
    n1 = len(self.fundamental_returns)
    fundamental_returns_sorted = np.sort(self.fundamental_returns)
    fund_quantile = np.searchsorted(
        fundamental_returns_sorted, self.fundamental_returns,
        side='right') / n1
    quantile_dict = dict()
    for sigma_n in sigma_n_grid:
        time_start = time.time()
        dist_sim = self.generateReturnDistribution(sigma_n, batch_size, parallel)
        dist_sim = np.sort(dist_sim)
        n2 = len(dist_sim)
        quantile_dict[sigma_n] = np.searchsorted(
            dist_sim, self.fundamental_returns, side='right') / n2
        time_end = time.time()
        print("sigma_n {} finished with total time {}, loss {}.".format(
            sigma_n, time_end - time_start,
            np.max(quantile_dict[sigma_n] - fund_quantile)))
        logprint("sigma_n {} finished with total time {}, loss {}.\n".format(
            sigma_n, time_end - time_start,
            np.max(quantile_dict[sigma_n] - fund_quantile)))
    print("Quantile calculation finished. Start optimization.")
    m = Model("DRO")
    q = m.addVar(vtype=GRB.CONTINUOUS, name='q')
    W = dict()
    W_sum = 0
    quantile_avg = dict()
    m.addConstr(q >= 0, name="positive_q")
    for i in range(len(sigma_n_grid)):
        W[i] = m.addVar(vtype=GRB.CONTINUOUS, name='W' + str(i))
        m.addConstr(W[i] >= 0, "positive_W" + str(i))
        W_sum += W[i]
        for j in range(len(fund_quantile)):
            if j in quantile_avg:
                quantile_avg[j] += W[i] * quantile_dict[sigma_n_grid[i]][j]
            else:
                quantile_avg[j] = W[i] * quantile_dict[sigma_n_grid[i]][j]
    m.addConstr(W_sum == 1, name="sum_prob")
    # Note: n2 here is the sample size of the last simulated distribution;
    # all grid points share batch_size, so the sizes are assumed equal.
    for j in range(len(fund_quantile)):
        m.addConstr(
            fund_quantile[j] - q / np.sqrt(n1 * n2 / (n1 + n2)) <= quantile_avg[j],
            name="qCons1_" + str(j))
        m.addConstr(
            fund_quantile[j] + q / np.sqrt(n1 * n2 / (n1 + n2)) >= quantile_avg[j],
            name="qCons2_" + str(j))
    m.setObjective(q, GRB.MINIMIZE)
    m.optimize()
    print("Optimization finished.")
    W_optimal = [W[i].x for i in W]
    plt.plot(sigma_n_grid, W_optimal)
    plt.xlabel(r"$\sigma_n^2$")
    plt.ylabel("Weights")
    plt.savefig(log_file + "_weights.png")
    plt.show()
    return m.objVal, kstwobign.sf(m.objVal), W_optimal
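# Interpretation note (an addition, not from the original code): the optimal
# objective q is the smallest scaled KS distance between the empirical
# quantiles and any mixture of the simulated quantile curves, so
# kstwobign.sf(q) converts it into an asymptotic two-sample p-value, exactly
# as in the tests above. Hypothetical call sketch with illustrative values:
#
# obj, pval, weights = model.calibrateModelDRO(
#     sigma_n_grid=np.linspace(0.01, 0.10, 10), batch_size=16)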