Code example #1
    def _simulate_vol(
        self,
        r: np.ndarray,
        theta: np.ndarray = None,
    ) -> np.ndarray:
        """Simulates the GARCH(1,1) conditional variance process.

        Args:
            r (np.ndarray): Returns vector.
            theta (np.ndarray, optional): Estimated weights from fitting,
                shape = (p_features,). Defaults to None, in which case
                ``self.theta`` is used.

        Returns:
            np.ndarray: Predicted conditional variance series.
        """
        n = r.shape[0]
        vol = np.zeros(n)
        if theta is None:
            omega, gamma, beta = self.theta
        else:
            omega, gamma, beta = theta
        # Initialize with the sample variance; the unconditional variance
        # of the GARCH(1,1) process would be an alternative:
        # vol[0] = omega / (1 - gamma - beta)
        vol[0] = r.var()
        # Simulate the GARCH(1,1) recursion
        for idx in range(1, n):
            vol[idx] = omega + gamma * r[idx - 1]**2 + beta * vol[idx - 1]
        return vol
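As a quick illustration, a minimal usage sketch: `model`, its class, and the parameter values are hypothetical stand-ins; only `_simulate_vol` itself comes from the snippet above.

import numpy as np

# `model` is a hypothetical instance of the surrounding class.
rng = np.random.default_rng(0)
r = 0.01 * rng.standard_normal(500)         # toy daily returns
model.theta = np.array([1e-6, 0.09, 0.90])  # illustrative omega, gamma, beta
sigma2 = model._simulate_vol(r)             # conditional variance path
print(np.sqrt(sigma2[:5]))                  # corresponding volatilities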
Code example #2
    def fit(self, r: np.ndarray) -> Garch:
        """Fits training data via quasi maximum likelihood

        :param r: return series
        :type r: np.ndarray, shape = (n_samples,)
        :return: Garch object with estimated parameters
        :rtype: Garch
        """
        if self.mean:
            a_t = r - r.mean()
        else:
            a_t = r
        # Initial guesses for omega, gamma, and beta
        guess_params = np.array([a_t.var(), 0.09, 0.90])
        finfo = np.finfo(np.float64)
        bounds = [(finfo.eps, 2 * r.var(ddof=1)), (0.0, 1.0), (0.0, 1.0)]
        cons = {'type': 'ineq', 'fun': self._constraint}
        self.theta = minimize(self._objective_func,
                              guess_params,
                              method='SLSQP',
                              jac=self._jacobian,
                              options={'disp': True},
                              bounds=bounds,
                              args=(a_t,),  # must be a tuple, not bare (a_t)
                              constraints=cons)['x']
        return self
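The helpers `_objective_func` and `_constraint` are not shown in the snippet. A plausible sketch, assuming a Gaussian quasi log-likelihood and the usual GARCH(1,1) stationarity condition, and reusing `_simulate_vol` from code example #1 (the jacobian is omitted):

    def _objective_func(self, theta: np.ndarray, a_t: np.ndarray) -> float:
        # Negative Gaussian quasi log-likelihood (additive constants dropped)
        sigma2 = self._simulate_vol(a_t, theta)
        return 0.5 * np.sum(np.log(sigma2) + a_t**2 / sigma2)

    def _constraint(self, theta: np.ndarray) -> float:
        # Covariance stationarity: gamma + beta < 1
        # (SLSQP's 'ineq' constraints keep this value >= 0)
        _, gamma, beta = theta
        return 1.0 - gamma - beta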
Code example #3
import numpy as np
from scipy import spatial


def seuclidean(x: np.ndarray, y: np.ndarray) -> float:
    """
    Compute the standardized Euclidean distance between the mean of a
    multivariate candidate sample and the mean of a reference sample.

    This method is scale-invariant.

    Parameters
    ----------
    x : np.ndarray (n,d)
      Reference sample.
    y : np.ndarray (m,d)
      Candidate sample.

    Returns
    -------
    float
        Standardized Euclidean Distance between the mean of the samples
        ranging from 0 to infinity.

    Notes
    -----
    This metric considers neither the information from individual points nor
    the standard deviation of the candidate distribution.

    References
    ----------
    Veloz et al. (2011) Identifying climatic analogs for Wisconsin under
    21st-century climate-change scenarios. Climatic Change,
    DOI 10.1007/s10584-011-0261-z.
    """
    mx = x.mean(axis=0)
    my = y.mean(axis=0)

    return spatial.distance.seuclidean(mx, my, x.var(axis=0, ddof=1))
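A usage sketch on synthetic data. Standardizing by the reference sample's per-feature variance is what makes the distance scale-invariant; here the second feature has a much larger scale, yet both features contribute comparably:

import numpy as np
from scipy import spatial

rng = np.random.default_rng(42)
x = rng.normal([0.0, 0.0], [1.0, 100.0], size=(1000, 2))  # reference
y = rng.normal([0.5, 50.0], [1.0, 100.0], size=(500, 2))  # candidate
print(seuclidean(x, y))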
Code example #4
    def update_batch(self, x: np.ndarray):
        """Update the running mean and (population) variance with a batch.

        Combines the batch moments with the stored moments using the
        standard pooled-moment update (cf. Chan et al.'s parallel
        variance algorithm).
        """
        m = x.shape[0]
        mu_x = x.mean(axis=0)
        if m == 1:
            # A single observation contributes no within-batch variance
            v_x = 0
        else:
            v_x = x.var(axis=0, ddof=0)

        n = self.n
        self.n = m + n
        if n == 0:
            # First batch: adopt its moments directly
            self.mu = mu_x
            self.v = v_x
        else:
            c1, c2 = n / self.n, m / self.n
            c3 = c1 * c2
            mu = self.mu
            self.mu = c1 * self.mu + c2 * mu_x
            self.v = c1 * self.v + c2 * v_x + c3 * ((mu - mu_x)**2)
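A quick sanity check of the pooled update: after consuming the data in chunks, the running moments should match the full-sample values. This assumes `update_batch` is a method of a class (here given the hypothetical name `RunningStats`) whose constructor sets `self.n = 0`:

import numpy as np

rng = np.random.default_rng(1)
data = rng.standard_normal((100, 3))

stats = RunningStats()  # hypothetical wrapper class with n = 0 initially
for batch in np.array_split(data, 7):
    stats.update_batch(batch)

# The pooled moments match the full-sample values (note ddof=0):
assert np.allclose(stats.mu, data.mean(axis=0))
assert np.allclose(stats.v, data.var(axis=0, ddof=0))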
Code example #5
    def compute_statistics(self, sample: np.ndarray) -> Tuple:
        """
        Computes the mean and variance of a sample.

        Note that ``np.ndarray.var`` defaults to the biased (ddof=0) estimator.

        :param sample: A sample to compute statistics for.
        :return: A tuple (mean, variance).
        """
        return sample.mean(), sample.var()
Code example #6
import numpy as np


def rmse_to_smse(rmse: float, y_test: np.ndarray) -> float:
    """Computes the standardized mean squared error (SMSE).

    SMSE = MSE / Var(y_test), so the trivial method of guessing the mean
    of the training targets has an SMSE of approximately 1.
    """
    mse = rmse**2
    target_variance = y_test.var()
    return mse / target_variance
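A short check of the property the docstring states: when the prediction is just the target mean, the MSE equals the target variance and the SMSE comes out at 1 (the data here are synthetic):

import numpy as np

rng = np.random.default_rng(0)
y_test = rng.normal(5.0, 2.0, size=1000)
rmse_mean_guess = np.sqrt(np.mean((y_test - y_test.mean())**2))
print(rmse_to_smse(rmse_mean_guess, y_test))  # 1.0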
Code example #7
    def _generate_regression_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0):
        """
        Recursively builds a least-squares regression tree.
        """
        # Initialize the best split (feature, point)
        best_feature = best_point = None
        pair = (best_feature, best_point)
        y_var = y.var()
        min_loss = y_var * np.size(y)
        rows, features = X.shape
        # Fewer than 2 samples: stop splitting and return a leaf node
        if rows < 2:
            return Node(y.mean(), y.var(), rows, pair, str(uuid1()))
        # All samples share the same target value: stop splitting, return a leaf
        if np.size(np.unique(y)) == 1:
            return Node(y[0], 0, rows, pair, str(uuid1()))
        for f in range(features):
            # Deduplicate feature values
            unique_point = np.unique(X[:, f])
            # Midpoints of adjacent unique values serve as candidate split points
            split_point = [(unique_point[i] + unique_point[i + 1]) / 2.0 for i in range(np.size(unique_point) - 1)]
            # Evaluate each candidate split point
            for p in split_point:
                _, _, left_y, right_y = self._split(X, y, f, p)
                loss = left_y.var() * np.size(left_y) + right_y.var() * np.size(right_y)
                if loss < min_loss:
                    best_feature, best_point = f, p
                    min_loss = loss
        pair = (best_feature, best_point)
        root = Node(y.mean(), y_var, rows, pair, str(uuid1()))
        # No split improved the loss: stop and return a leaf node (supports pre-pruning)
        if best_feature is None:
            return root
        left_x, right_x, left_y, right_y = self._split(X, y, best_feature, best_point)
        depth += 1
        if depth <= self.max_depth:
            root._left = self._generate_regression_tree(left_x, left_y, depth)
            root._right = self._generate_regression_tree(right_x, right_y, depth)
        return root
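The `_split` helper is not shown. A minimal sketch consistent with how it is called here and in code example #8 (the `<=` split convention is an assumption):

    def _split(self, X: np.ndarray, y: np.ndarray, feature: int, point: float):
        """Partition (X, y) by X[:, feature] <= point."""
        mask = X[:, feature] <= point
        return X[mask], X[~mask], y[mask], y[~mask]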
Code example #8
    def _weak_regressor(self, X: np.ndarray, r: np.ndarray) -> tuple:
        """Fits a depth-1 stump to the residuals r by least squares."""
        # best_f_p = (feature_index, split_point, left_output, right_output)
        best_f_p = (None, None, None, None)
        min_loss = r.var() * np.size(r)
        for f in range(self.n_features):
            unique_x = np.unique(X[:, f])
            # Midpoints of adjacent unique values as candidate split points
            split_point = [(unique_x[i] + unique_x[i + 1]) / 2.0 for i in range(np.size(unique_x) - 1)]
            for p in split_point:
                _, _, left_y, right_y = self._split(X, r, f, p)
                loss = left_y.var() * np.size(left_y) + right_y.var() * np.size(right_y)
                if loss < min_loss:
                    min_loss = loss
                    best_f_p = (f, p, left_y.mean(), right_y.mean())
        return best_f_p
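For context, this stump search would typically sit inside a squared-loss boosting loop that refits on residuals. A hedged sketch; everything besides `_weak_regressor` and `_split` (the method name, `n_rounds`, `lr`, `self.stumps`) is an assumption, not the source's API:

    def fit(self, X: np.ndarray, y: np.ndarray,
            n_rounds: int = 100, lr: float = 0.1):
        self.n_features = X.shape[1]
        pred = np.full(y.shape, y.mean(), dtype=float)
        self.stumps = []
        for _ in range(n_rounds):
            r = y - pred                        # residuals of squared loss
            f, p, left, right = self._weak_regressor(X, r)
            if f is None:                       # no split improved the loss
                break
            pred += lr * np.where(X[:, f] <= p, left, right)
            self.stumps.append((f, p, left, right))
        return self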
Code example #9
import numpy as np


def intrinsic_dimension(X: np.ndarray, k1: int = 6, k2: int = 12,
                        estimator: str = 'levina', metric: str = 'vector',
                        trafo: str = 'var', mem_threshold: int = 5000):
    """Calculate intrinsic dimension based on the MLE by Levina and Bickel [1]_.
    
    Parameters
    ----------
    X : ndarray
        - An ``m x n`` vector data matrix with ``n`` objects in an 
          ``m`` dimensional feature space 
        - An ``n x n`` distance matrix.
        
        NOTE: The type must be defined via parameter `metric`!
        
    k1 : int, optional (default: 6)
        Start of neighborhood range to search in.
        
    k2 : int, optional (default: 12)
        End of neighborhood range to search in.
        
    estimator : {'levina', 'mackay'}, optional (default: 'levina')
        Determine the summation strategy: see [2]_.
    
    metric : {'vector', 'distance'}, optional (default: 'vector')
        Determine data type of `X`. 
        
        NOTE: the MLE was derived for Euclidean distances. Using 
        other dissimilarity measures may lead to undefined results.
        
    trafo : {None, 'std', 'var'}, optional (default: 'var')
        Transform vector data. 
        
        - None: no transformation
        - 'std': standardization 
        - 'var': subtract mean, divide by variance (default behavior of 
          Laurens van der Maaten's DR toolbox, and most likely also of 
          its other ID/DR techniques).

    mem_threshold : int, optional (default: 5000)
        Controls the speed-memory trade-off: if the number of points exceeds
        the given value, the complete distance matrix is not calculated at
        once (fast, high memory), but per row (slower, less memory).

    Returns
    -------
    d_mle : int
        Intrinsic dimension estimate (rounded to the nearest integer)
    
    References
    ----------
    .. [1] Levina, E., & Bickel, P. (2004). Maximum likelihood estimation of 
           intrinsic dimension. Advances in Neural Information …, 17, 777–784. 
           http://doi.org/10.2307/2335172
    .. [2] http://www.inference.phy.cam.ac.uk/mackay/dimension/
    """
    n = X.shape[0]
    if estimator not in ['levina', 'mackay']:
        raise ValueError("Parameter 'estimator' must be 'levina' or 'mackay'.")
    if k1 < 1 or k2 < k1 or k2 >= n:
        raise ValueError("Invalid neighborhood: Please make sure that "
                         "0 < k1 <= k2 < n. (Got k1={} and k2={}).".
                         format(k1, k2))
    X = X.copy().astype(float)
        
    if metric == 'vector':
        # Sort rows lexicographically
        X = X[np.lexsort(np.fliplr(X).T)]
        
        if trafo is None:
            pass
        elif trafo == 'var':
            X -= X.mean(axis=0) # broadcast
            X /= X.var(axis=0) + 1e-7 # broadcast
        elif trafo == 'std':
            # Standardization
            X -= X.mean(axis=0) # broadcast
            X /= X.std(axis=0) + 1e-7 # broadcast
        else:
            raise ValueError("Transformation must be None, 'std', or 'var'.")
        
        # Compute matrix of log nearest neighbor distances
        X2 = (X**2).sum(1)
        
        if n <= mem_threshold:  # speed-memory trade-off
            distance = X2.reshape(-1, 1) + X2 - 2 * np.dot(X, X.T)  # 2x broadcast
            distance.sort(1)
            # Replace invalid (slightly negative, from round-off) values
            distance[distance < 0] = 1e-7
            knnmatrix = .5 * np.log(distance[:, 1:k2+1])
        else:
            knnmatrix = np.zeros((n, k2))
            for i in range(n):
                distance = np.sort(X2[i] + X2 - 2 * np.dot(X, X[i, :]))
                # Replace invalid values with a small number
                distance[distance < 0] = 1e-7
                knnmatrix[i, :] = .5 * np.log(distance[1:k2+1])
    
    elif metric == 'distance':
        # TODO: the draft calculation for distance matrices was wrong
        raise NotImplementedError("ID currently only supports vector data.")
    elif metric == 'similarity':
        # TODO: the draft calculation for similarity data was wrong, and
        # similarity data may return undefined results
        raise NotImplementedError("ID currently only supports vector data.")
    else:
        raise ValueError("Parameter 'metric' must be 'vector' or 'distance'.")
    
    # Compute the ML estimate
    S = np.cumsum(knnmatrix, 1)
    indexk = np.arange(k1, k2+1) # broadcasted afterwards
    dhat = -(indexk - 2) / (S[:, k1-1:k2] - knnmatrix[:, k1-1:k2] * indexk)
       
    if estimator == 'levina':
        # Average over estimates and over values of k
        no_dims = dhat.mean()
    elif estimator == 'mackay':
        # Average over inverses
        dhat **= -1
        dhat_k = dhat.mean(0)
        no_dims = (dhat_k ** -1).mean()
           
    return int(no_dims.round())
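A usage sketch on synthetic data with a known intrinsic dimension: 3-D Gaussian latent data embedded linearly in 10-D. The MLE is known to be biased, so expect an estimate near, not exactly, 3:

import numpy as np

rng = np.random.default_rng(0)
Z = rng.standard_normal((1000, 3))   # latent 3-D data
A = rng.standard_normal((3, 10))     # random linear embedding into 10-D
X_embedded = Z @ A
print(intrinsic_dimension(X_embedded, k1=6, k2=12))  # expect ~3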