def ks_stat(data1, data2):
    """
    Largest vertical gap between the ECDF of `data1` and the ECDF of `data2`.

    The ECDF of `data1` is obtained from ``dcst.ecdf`` and the reference
    CDF is ``dcst.ecdf_formal`` evaluated at the same points using `data2`.
    Both the top and the bottom of every step of the `data1` ECDF are
    compared against the reference.

    Parameters
    ----------
    data1 : array_like
        sample whose ECDF steps are inspected.
    data2 : array_like
        sample defining the reference CDF.

    Returns
    -------
    The maximum over all computed distances.
    """
    points, heights = dcst.ecdf(data1)

    # Reference CDF evaluated at the ECDF support points.
    reference = dcst.ecdf_formal(points, data2)

    # Gap measured at the top of each step.
    gap_above = heights - reference

    # Gap measured at the bottom of each step; the step height is
    # 1 / len(data1), hence the additive correction.
    step = 1 / len(data1)
    gap_below = reference - heights + step

    return np.max((gap_above, gap_below))
def test_ecdf_formal_custom():
    """Check ``dcst.ecdf_formal`` against hand-computed values."""
    ordered = (0, 1, 2, 3)
    shuffled = (3, 2, 0, 1)

    # (evaluation point, data set, expected ECDF value)
    scalar_cases = (
        (0.1, ordered, 0.25),
        (-0.1, ordered, 0.0),
        (0.1, shuffled, 0.25),
        (-0.1, shuffled, 0.0),
        (2, shuffled, 0.75),
        (1, shuffled, 0.5),
        (3, shuffled, 1.0),
        (0, shuffled, 0.25),
    )
    for point, sample, expected in scalar_cases:
        # A fresh list is passed on every call, as separate literals would be.
        assert dcst.ecdf_formal(point, list(sample)) == expected

    # NaN in the evaluation points must be rejected.
    with pytest.raises(RuntimeError) as excinfo:
        dcst.ecdf_formal([np.nan, np.inf], list(ordered))
    excinfo.match("Input cannot have NaNs.")

    # Points beyond the largest observation, including +inf, evaluate to 1.
    expected_tail = np.array([1.0, 1.0])
    observed_tail = dcst.ecdf_formal([3.1, np.inf], list(shuffled))
    assert np.allclose(expected_tail, observed_tail, atol=atol)
def predict(self, data=None, digits=3):
    """
    Prediction based on IDR model fit

    Parameters
    ----------
    data : pd.DataFrame, optional
        containing variables with which to predict. The default is None,
        in which case predictions are built from the fitted CDFs for the
        indices stored in ``self.indices``.
    digits : integer value, optional
        number of decimal places for predictive CDF. The default is 3.

    Returns
    -------
    object of class idrpredict.
        predictions : list of objects of class predictions_idr, built from
            points : where predictive CDF has jumps
            ecdf : estimated CDF evaluated at points
            lower : bounds for estimated CDF (out-of-sample predictions)
            upper : bounds for estimated CDF (out-of-sample predictions)
        incomparables : None when data is None or when the prepared data
            has a single column. Otherwise ``np.where(mask)`` for the mask
            of predictions that have neither smaller nor greater neighbours
            in the training data; those predictions are built from the
            rounded ECDF of ``self.y`` instead.

    Raises
    ------
    ValueError
        if data is not a pandas data frame, or if it lacks a column that
        is present in ``self.X``.
    """
    cdf = self.ecdf.copy()
    thresholds = self.thresholds.copy()
    order_indices = []
    preds = []

    # ---- no new data: one prediction per fitted index ---------------------
    if data is None:
        indices = self.indices
        for i in range(indices.shape[0]):
            edf = np.round(cdf[i, :], digits)
            # Keep only the thresholds at which the rounded CDF increases.
            sel = np.hstack([edf[0] > 0, np.diff(edf) > 0])
            tmp = predictions_idr(ecdf=edf[sel], points=thresholds[sel],
                                  lower=[], upper=[])
            # Every index listed in indices[i] shares the same prediction.
            for j in indices[i]:
                order_indices.append(j)
                preds.append(tmp)
        # Put the predictions back into the order given by the indices.
        preds_rearanged = [preds[k] for k in np.argsort(order_indices)]
        idr_predictions = idrpredict(predictions=preds_rearanged,
                                     incomparables=None)
        return idr_predictions

    if not isinstance(data, pd.DataFrame):
        raise ValueError("data must be a pandas data frame")

    X = self.X.copy()
    if not all(elem in data.columns for elem in X.columns):
        raise ValueError("some variables of idr fit are missing in data")

    data = data.copy()
    data = prepareData(data[X.columns], groups=self.groups,
                       orders=self.orders)
    nVar = data.shape[1]

    # ---- single covariate: interpolate between neighbouring CDFs ----------
    if nVar == 1:
        X = np.array(X[X.columns[0]])
        x = np.array(data[data.columns[0]])

        # Index of the training value just below each x (clamped at 0).
        # NOTE(review): bisect and np.interp both assume X is sorted in
        # increasing order - confirm against the fitting code.
        smaller = np.array([bisect.bisect_left(X, a) for a in x])
        smaller = np.where(smaller == 0, 1, smaller) - 1

        # Fractional position of x between its neighbours: wg is the weight
        # of the greater neighbour and ws the weight of the smaller one.
        ranks = np.arange(1, X.shape[0] + 1)
        wg = (np.interp(x, X, ranks, left=1, right=X.shape[0])
              - ranks[smaller.astype(int)])
        greater = smaller + (wg > 0).astype(int)
        ws = 1 - wg

        # The CDF row of the greater neighbour is passed as the lower bound
        # and the row of the smaller neighbour as the upper bound.
        lower_rows = np.round(cdf[greater.astype(int), :], digits)
        upper_rows = np.round(cdf[smaller.astype(int), :], digits)

        def fun_preds(low, upp, w_smaller, w_greater):
            # Keep thresholds at which either bound increases.
            low_shifted = np.insert(low[:-1], 0, 0)
            upp_shifted = np.insert(upp[:-1], 0, 0)
            ind = (low_shifted < low) | (upp_shifted < upp)
            low = low[ind]
            upp = upp[ind]
            mixed = np.round(np.multiply(low, w_greater)
                             + np.multiply(upp, w_smaller), digits)
            return predictions_idr(ecdf=mixed, points=thresholds[ind],
                                   lower=low, upper=upp)

        preds = [fun_preds(low, upp, w_s, w_g)
                 for low, upp, w_s, w_g in zip(lower_rows, upper_rows, ws, wg)]
        idr_predictions = idrpredict(predictions=preds, incomparables=None)
        return idr_predictions

    # ---- several covariates: bounds from comparable training points -------
    nPoints = neighbor_points(data, X, order_X=self.constraints)
    smaller = nPoints[0]
    greater = nPoints[1]

    # True where a prediction has neither smaller nor greater neighbours.
    incomparables = (np.array(list(map(len, smaller)))
                     + np.array(list(map(len, greater)))) == 0

    if any(incomparables):
        y = self.y
        edf = np.round(dcst.ecdf_formal(thresholds, y.explode()), digits)
        sel = edf > 0
        edf = edf[sel]
        points = thresholds[sel]

        # Several entries of edf can equal 1 (e.g. after rounding). Keep
        # everything up to and including the first of them and drop the
        # rest. The first index is taken explicitly because comparing the
        # whole index array in an `if` raises ValueError as soon as it has
        # more than one element.
        # NOTE(review): assumes edf is non-decreasing, i.e. thresholds are
        # sorted - confirm.
        at_one = np.flatnonzero(edf == 1)
        if at_one.size > 0 and at_one[0] < len(edf) - 1:
            keep = at_one[0] + 1
            points = points[:keep]
            edf = edf[:keep]

        tmp = predictions_idr(ecdf=edf, points=points, lower=edf, upper=edf)
        for i in np.flatnonzero(incomparables):
            preds.append(tmp)
            order_indices.append(i)

    def jump_mask(values):
        # True at the first entry if it is non-zero and wherever the value
        # differs from its predecessor.
        return np.hstack([values[0] != 0, np.diff(values) != 0])

    for i in np.flatnonzero(~incomparables):
        if smaller[i].size > 0 and greater[i].size == 0:
            # Only smaller neighbours: they give the upper bound, the lower
            # bound is zero and the upper bound is used as the estimate.
            upper = np.round(
                np.amin(cdf[smaller[i].astype(int), :], axis=0), digits)
            sel = jump_mask(upper)
            upper = upper[sel]
            lower = np.zeros(len(upper))
            estimCDF = upper
        elif smaller[i].size == 0 and greater[i].size > 0:
            # Only greater neighbours: they give the lower bound, the upper
            # bound is one and the lower bound is used as the estimate.
            lower = np.round(
                np.amax(cdf[greater[i].astype(int), :], axis=0), digits)
            sel = jump_mask(lower)
            lower = lower[sel]
            upper = np.ones(len(lower))
            estimCDF = lower
        else:
            # Neighbours on both sides: the estimate is the midpoint of the
            # two bounds.
            lower = np.round(
                np.amax(cdf[greater[i].astype(int), :], axis=0), digits)
            upper = np.round(
                np.amin(cdf[smaller[i].astype(int), :], axis=0), digits)
            sel = jump_mask(lower) | jump_mask(upper)
            lower = lower[sel]
            upper = upper[sel]
            estimCDF = np.round(0.5 * (lower + upper), digits)

        tmp = predictions_idr(ecdf=estimCDF, points=thresholds[sel],
                              lower=lower, upper=upper)
        order_indices.append(i)
        preds.append(tmp)

    # Restore the row order of `data`.
    preds_rearanged = [preds[k] for k in np.argsort(order_indices)]
    idr_predictions = idrpredict(predictions=preds_rearanged,
                                 incomparables=np.where(incomparables))
    return idr_predictions
def test_ecdf_formal(x, data):
    """
    Compare ``dcst.ecdf_formal`` with a reference built from np.searchsorted.

    The reference counts how many observations are <= x (``side="right"``
    on the sorted data) and divides by the sample size.
    NOTE(review): `x` and `data` are presumably supplied by a decorator or
    fixture that is not visible here - confirm.
    """
    sorted_data = np.sort(data)
    expected = np.searchsorted(sorted_data, x, side="right") / len(data)
    observed = dcst.ecdf_formal(x, data)
    assert np.allclose(observed, expected, atol=atol, equal_nan=True)