def _simulate_vol(
    self,
    r: np.ndarray,
    theta: np.ndarray = None,
) -> np.ndarray:
    """Run the GARCH(1,1) variance recursion over a return series.

    The first element is initialised with the sample variance of ``r``;
    every later element follows
    ``vol[t] = omega + gamma * r[t - 1]**2 + beta * vol[t - 1]``.

    Args:
        r (np.ndarray): Returns vector.
        theta (np.ndarray, optional): Parameters ``(omega, gamma, beta)``.
            When None, ``self.theta`` is used instead. Defaults to None.

    Returns:
        np.ndarray: Series produced by the recursion, one value per
            element of ``r`` (called "volatility" by the original author).
    """
    n_obs = r.shape[0]
    sigma2 = np.zeros(n_obs)

    omega, gamma, beta = self.theta if theta is None else theta

    # Seed the recursion with the sample variance of the returns.
    sigma2[0] = r.var()

    # Each step depends on the previous one, so this stays a plain loop.
    for t, prev_return in enumerate(r[:-1], start=1):
        sigma2[t] = omega + gamma * prev_return**2 + beta * sigma2[t - 1]

    return sigma2
def fit(self, r: np.ndarray) -> Garch:
    """Fits training data via quasi maximum likelihood

    The parameter vector ``(omega, gamma, beta)`` is estimated with SciPy's
    SLSQP solver and stored on ``self.theta``.

    :param r: return series
    :type r: np.ndarray, shape = (n_samples,)
    :return: Garch object with estimated parameters
    :rtype: Garch
    """
    # Optionally work on de-meaned returns.
    a_t = r - r.mean() if self.mean else r

    # Starting point for omega, gamma and beta.
    guess_params = np.array([a_t.var(), 0.09, 0.90])

    # omega is kept strictly positive and below twice the sample variance;
    # gamma and beta are each confined to [0, 1].
    eps = np.finfo(np.float64).eps
    bounds = [(eps, 2 * r.var(ddof=1)), (0.0, 1.0), (0.0, 1.0)]

    # NOTE(review): self._constraint is defined elsewhere; presumably it
    # enforces stationarity (gamma + beta < 1) -- confirm.
    cons = {'type': 'ineq', 'fun': self._constraint}

    result = minimize(
        self._objective_func,
        guess_params,
        method='SLSQP',
        jac=self._jacobian,
        options={'disp': True},
        bounds=bounds,
        # ``args`` must be a tuple; ``(a_t)`` is just ``a_t`` in parentheses.
        args=(a_t,),
        constraints=cons,
    )
    self.theta = result['x']
    return self
def seuclidean(x: np.ndarray, y: np.ndarray) -> float: """ Compute the Euclidean distance between the mean of a multivariate candidate sample with respect to the mean of a reference sample. This method is scale-invariant. Parameters ---------- x : np.ndarray (n,d) Reference sample. y : np.ndarray (m,d) Candidate sample. Returns ------- float Standardized Euclidean Distance between the mean of the samples ranging from 0 to infinity. Notes ----- This metric considers neither the information from individual points nor the standard deviation of the candidate distribution. References ---------- Veloz et al. (2011) Identifying climatic analogs for Wisconsin under 21st-century climate-change scenarios. Climatic Change, DOI 10.1007/s10584-011-0261-z. """ mx = x.mean(axis=0) my = y.mean(axis=0) return spatial.distance.seuclidean(mx, my, x.var(axis=0, ddof=1))
def update_batch(self, x: np.ndarray):
    """Fold a batch of samples into the running mean and variance.

    Reads and updates ``self.n`` (samples seen so far), ``self.mu``
    (running mean) and ``self.v`` (running population variance, ddof=0).
    Statistics are taken along axis 0 of ``x``, so the first axis indexes
    the samples of the batch.

    :param x: batch of samples; ``x.shape[0]`` is the batch size.
    """
    m = x.shape[0]
    if m == 0:
        # An empty batch carries no information; taking its mean/variance
        # would yield NaN and corrupt the running statistics.
        return

    mu_x = x.mean(axis=0)
    # For a single sample this is an all-zero array, so the stored variance
    # always has the same shape as the stored mean.
    v_x = x.var(axis=0, ddof=0)

    n = self.n
    self.n = n + m

    if n == 0:
        # First batch: its statistics are the running statistics.
        self.mu = mu_x
        self.v = v_x
        return

    # Weights of the previous samples and of the new batch.
    c1, c2 = n / self.n, m / self.n
    # Offset between the old mean and the batch mean, taken before the
    # running mean is overwritten.
    delta = self.mu - mu_x
    self.mu = c1 * self.mu + c2 * mu_x
    # Pooled variance: weighted within-group variances plus the
    # between-group term c1 * c2 * delta**2.
    self.v = c1 * self.v + c2 * v_x + c1 * c2 * delta**2
def compute_statistics(self, sample: np.ndarray) -> Tuple:
    """
    Summarise a sample by its mean and variance.

    Both statistics are taken over all elements of the array, and the
    variance uses NumPy's default normalisation (ddof=0).

    :param sample: A sample to compute statistics for.
    :return: A tuple (mean, variance).
    """
    mean = sample.mean()
    variance = sample.var()
    return mean, variance
def rmse_to_smse(rmse: float, y_test: np.ndarray) -> float:
    """Convert an RMSE into a standardized mean squared error (SMSE).

    The squared RMSE is divided by the variance of the test targets. A
    trivial predictor that always outputs the mean of the training targets
    scores an SMSE of approximately 1.
    """
    return rmse**2 / y_test.var()
def _generate_regression_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0):
    """
    Recursively grow a least-squares regression tree.

    For every feature, the midpoints between consecutive distinct values
    are tried as thresholds; the split with the smallest summed squared
    error of the two sides is kept, provided it beats the unsplit node.
    Children are only attached while ``depth + 1 <= self.max_depth``.
    """
    # No split chosen yet.
    chosen_feature = chosen_threshold = None
    split = (chosen_feature, chosen_threshold)

    y_var = y.var()
    # variance * count == sum of squared deviations of the unsplit node
    lowest_loss = y_var * np.size(y)
    n_rows, n_features = X.shape

    # Fewer than two samples: nothing to split, emit a leaf.
    if n_rows < 2:
        return Node(y.mean(), y.var(), n_rows, split, str(uuid1()))

    # All targets identical: emit a leaf.
    if np.size(np.unique(y)) == 1:
        return Node(y[0], 0, n_rows, split, str(uuid1()))

    for feature in range(n_features):
        # Distinct, sorted values of this feature.
        levels = np.unique(X[:, feature])
        # Midpoints of neighbouring values are the candidate thresholds.
        thresholds = [(lower + upper) / 2.0
                      for lower, upper in zip(levels[:-1], levels[1:])]
        for threshold in thresholds:
            # NOTE(review): self._split is defined elsewhere; it is assumed
            # to return (left_X, right_X, left_y, right_y) -- confirm.
            _, _, y_left, y_right = self._split(X, y, feature, threshold)
            loss = (y_left.var() * np.size(y_left)
                    + y_right.var() * np.size(y_right))
            if loss < lowest_loss:
                chosen_feature, chosen_threshold = feature, threshold
                lowest_loss = loss
                split = (chosen_feature, chosen_threshold)

    root = Node(y.mean(), y_var, n_rows, split, str(uuid1()))

    # No split improved on the unsplit node: return it as a leaf
    # (this acts as pre-pruning).
    if chosen_feature is None:
        return root

    left_x, right_x, left_y, right_y = self._split(
        X, y, chosen_feature, chosen_threshold
    )

    depth += 1
    if depth > self.max_depth:
        return root

    root._left = self._generate_regression_tree(left_x, left_y, depth)
    root._right = self._generate_regression_tree(right_x, right_y, depth)
    return root
def _weak_regressor(self, X: np.ndarray, r: np.ndarray) -> tuple:
    """Find the single best threshold split (a stump) for the targets ``r``.

    Every feature index below ``self.n_features`` is scanned; candidate
    thresholds are the midpoints between consecutive distinct feature
    values. A candidate wins when the summed squared error of its two
    sides is lower than the best seen so far, starting from the unsplit
    error.

    Returns a tuple ``(feature_index, split_point, left_output,
    right_output)`` where the outputs are the mean target on each side,
    or ``(None, None, None, None)`` when no split improves on the
    unsplit error.
    """
    stump = (None, None, None, None)
    # variance * count == sum of squared deviations without any split
    lowest_loss = r.var() * np.size(r)

    for feature in range(self.n_features):
        levels = np.unique(X[:, feature])
        thresholds = [(lower + upper) / 2.0
                      for lower, upper in zip(levels[:-1], levels[1:])]
        for threshold in thresholds:
            # NOTE(review): self._split is defined elsewhere; it is assumed
            # to return (left_X, right_X, left_y, right_y) -- confirm.
            _, _, r_left, r_right = self._split(X, r, feature, threshold)
            loss = (r_left.var() * np.size(r_left)
                    + r_right.var() * np.size(r_right))
            if loss < lowest_loss:
                lowest_loss = loss
                stump = (feature, threshold, r_left.mean(), r_right.mean())

    return stump
def intrinsic_dimension(X: np.ndarray, k1: int = 6, k2: int = 12,
                        estimator: str = 'levina', metric: str = 'vector',
                        trafo: str = 'var', mem_threshold: int = 5000):
    """Calculate intrinsic dimension based on the MLE by Levina and Bickel [1]_.

    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects (rows) in an
        ``m`` dimensional feature space (columns). Duplicate rows are
        removed before the estimate is computed, because coinciding points
        have zero neighbor distance, whose logarithm is undefined.
        The input array itself is never modified.

        NOTE: Distance and similarity matrices are not supported yet; see
        parameter `metric`.

    k1 : int, optional (default: 6)
        Start of neighborhood range to search in.

    k2 : int, optional (default: 12)
        End of neighborhood range to search in.

    estimator : {'levina', 'mackay'}, optional (default: 'levina')
        Determine the summation strategy: see [2]_.

    metric : {'vector', 'distance'}, optional (default: 'vector')
        Determine data type of `X`. Only ``'vector'`` is implemented;
        ``'distance'`` and ``'similarity'`` raise ``NotImplementedError``.

        NOTE: the MLE was derived for euclidean distances. Using
        other dissimilarity measures may lead to undefined results.

    trafo : {None, 'std', 'var'}, optional (default: 'var')
        Transform vector data.

        - None: no transformation
        - 'std': standardization
        - 'var': subtract mean, divide by variance (default behavior of
          Laurens van der Maaten's DR toolbox; most likely for other
          ID/DR techniques).

    mem_threshold : int, optional, default: 5000
        Controls speed-memory usage trade-off: If number of points is higher
        than the given value, don't calculate complete distance matrix at
        once (fast, high memory), but per row (slower, less memory).

    Returns
    -------
    d_mle : int
        Intrinsic dimension estimate (rounded to the nearest integer)

    Raises
    ------
    ValueError
        If `estimator`, `metric` or `trafo` is not a recognized option, or
        if the neighborhood does not satisfy ``0 < k1 <= k2 < n``, both
        for the raw number of rows and for the number of distinct rows.
    NotImplementedError
        If `metric` is ``'distance'`` or ``'similarity'``.

    References
    ----------
    .. [1] Levina, E., & Bickel, P. (2004). Maximum likelihood estimation of
           intrinsic dimension. Advances in Neural Information …, 17, 777–784.
           http://doi.org/10.2307/2335172
    .. [2] http://www.inference.phy.cam.ac.uk/mackay/dimension/
    """
    n = X.shape[0]
    if estimator not in ('levina', 'mackay'):
        raise ValueError("Parameter 'estimator' must be 'levina' or 'mackay'.")
    if k1 < 1 or k2 < k1 or k2 >= n:
        raise ValueError("Invalid neighborhood: Please make sure that "
                         "0 < k1 <= k2 < n. (Got k1={} and k2={}).".
                         format(k1, k2))

    # Validate the remaining options before any data is copied.
    if metric in ('distance', 'similarity'):
        # Earlier attempts at these branches were known to be wrong and
        # have never been enabled.
        raise NotImplementedError("ID currently only supports vector data.")
    if metric != 'vector':
        raise ValueError("Parameter 'metric' must be 'vector' or 'distance'.")
    if trafo not in (None, 'var', 'std'):
        raise ValueError("Transformation must be None, 'std', or 'var'.")

    # astype() returns a copy, so the caller's array is left untouched.
    X = X.astype(float)

    # Sort rows lexicographically, then drop rows equal to their
    # predecessor: after sorting, duplicates are always adjacent.
    X = X[np.lexsort(np.fliplr(X).T)]
    is_first_occurrence = np.ones(n, dtype=bool)
    is_first_occurrence[1:] = np.any(X[1:] != X[:-1], axis=1)
    X = X[is_first_occurrence]
    n = X.shape[0]
    if k2 >= n:
        raise ValueError("Invalid neighborhood: k2 must be smaller than the "
                         "number of distinct points. (Got k2={} and {} "
                         "distinct points).".format(k2, n))

    if trafo == 'var':
        X -= X.mean(axis=0)  # broadcast
        X /= X.var(axis=0) + 1e-7  # broadcast
    elif trafo == 'std':
        X -= X.mean(axis=0)  # broadcast
        X /= X.std(axis=0) + 1e-7  # broadcast

    # Log nearest-neighbor distances via ||a-b||^2 = ||a||^2 + ||b||^2 - 2ab.
    # Column 0 after sorting is the point itself and is skipped. The factor
    # .5 converts log(squared distance) into log(distance).
    sq_norms = (X**2).sum(1)
    if n <= mem_threshold:  # speed-memory trade-off
        distance = sq_norms.reshape(-1, 1) + sq_norms - 2 * np.dot(X, X.T)
        distance.sort(1)
        # Round-off can make squared distances zero or slightly negative;
        # neither has a finite logarithm, so use a small positive number.
        distance[distance <= 0] = 1e-7
        knnmatrix = .5 * np.log(distance[:, 1:k2+1])
    else:
        knnmatrix = np.zeros((n, k2))
        for i in range(n):
            distance = np.sort(sq_norms[i] + sq_norms
                               - 2 * np.dot(X, X[i, :]))
            distance[distance <= 0] = 1e-7
            knnmatrix[i, :] = .5 * np.log(distance[1:k2+1])

    # ML estimate per point and per neighborhood size k in [k1, k2]:
    # dhat = (k - 2) / sum_{j<k} log(T_k / T_j), where T_j is the distance
    # to the j-th neighbor and S holds the running sums of log T_j.
    S = np.cumsum(knnmatrix, 1)
    indexk = np.arange(k1, k2+1)  # broadcasted afterwards
    dhat = -(indexk - 2) / (S[:, k1-1:k2] - knnmatrix[:, k1-1:k2] * indexk)

    if estimator == 'levina':
        # Average over estimates and over values of k
        no_dims = dhat.mean()
    else:
        # 'mackay': average over inverses
        dhat **= -1
        dhat_k = dhat.mean(0)
        no_dims = (dhat_k ** -1).mean()

    return int(no_dims.round())