Example #1
    def __init__(self, data, **kwargs):
        r"""Constructor. This will fit both chi2 function in the different
        regimes.
            *data*      -   Data sample to use for fitting

        Keyword Argument:
            *chi1/2*    -   Keyword arguments like floc, fshape, etc. that are
                            passed to the constructor of the corresponding
                            chi2 scipy object.

        """
        data = np.asarray(data)

        c1 = kwargs.pop("chi1", dict())
        c2 = kwargs.pop("chi2", dict())

        self.par1 = chi2.fit(data[data > 0.], **c1)
        self.par2 = chi2.fit(-data[data < 0.], **c2)

        self.f1 = chi2(*self.par1)
        self.f2 = chi2(*self.par2)

        self.eta = float(np.count_nonzero(data > 0.)) / len(data)
        self.eta_err = np.sqrt(self.eta * (1. - self.eta) / len(data))

        # get fit-quality
        self.ks1 = kstest(data[data > 0.], "chi2", args=self.par1)[1]
        self.ks2 = kstest(-data[data < 0.], "chi2", args=self.par2)[1]

        return
Example #2
    def __init__(self, data, **kwargs):
        r"""Constructor, evaluates the percentage of events equal to zero and
        fits a chi2 to the rest of the data.

        Parameters
        ----------
        data : array
            Data values to be fit

        """
        data = np.asarray(data)

        if len(data) == 2:
            self.eta = data[0]
            self.par = [data[1], 0., 1.]

            self.eta_err = np.nan
            self.ks = np.nan

            self.f = chi2(*self.par)

            return

        self.par = chi2.fit(data[data > 0], **kwargs)

        self.f = chi2(*self.par)

        self.eta = float(np.count_nonzero(data > 0)) / len(data)
        self.eta_err = np.sqrt(self.eta * (1. - self.eta) / len(data))

        self.ks = kstest(data[data > 0], "chi2", args=self.par)[0]

        return
Example #3
def T1_test(sample_cov,true_cov,n):
    """
    Test the hypothesis that a sample covariance matrix comes from a
    multivariate normal distribution whose true covariance is given

    sample_cov:    sample covariance matrix
    true_cov:      known covariance matrix
    n:             number of observations per variable

    Returns the probability of obtaining a covariance matrix like this
    if the distribution were multivariate normal.

    Based on Nagao 1973, this is true only for n large (and larger
    than the size of the matrix).

    By Anne M. Archibald 2007
    """
    from numpy import dot, shape, trace, eye
    from scipy.linalg import inv
    from scipy.stats import chi2

    p, r = shape(sample_cov)
    if p!=r or (p,r) != shape(true_cov):
        raise ValueError("Sample covariance matrix (%d by %d) and true covariance matrix (%d by %d) must be square matrices of the same size" % (p,r,shape(true_cov)[0],shape(true_cov)[1]))
    if p>n:
        raise ValueError("This statistic is not correct for matrices with n smaller than the matrix size")
    M = dot(sample_cov,inv(true_cov))-eye(p)
    T1 = (n-1)/2*trace(dot(M,M))

    f = p*(p+1)/2
    return chi2(f).sf(T1)-(1./(n-1))*(p/12.*(4*p**2+9*p+7)*chi2(f+6).cdf(T1)-
                                      p/8.*(6*p**2+13*p+8)*chi2(f+4).cdf(T1)+
                                      p/2.*(p+1)**2*chi2(f+2).cdf(T1)-
                                      p/24.*(2*p**2+3*p-1)*chi2(f).cdf(T1))
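
A minimal usage sketch for T1_test with synthetic data (illustrative, not part of the original source; assumes numpy is imported as np):

import numpy as np
np.random.seed(0)
true_cov = np.array([[2.0, 0.3],
                     [0.3, 1.0]])
n = 200
X = np.random.multivariate_normal(np.zeros(2), true_cov, size=n)
sample_cov = np.cov(X, rowvar=False)
p = T1_test(sample_cov, true_cov, n)  # large p: consistent with true_cov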
Example #4
    def __setstate__(self, state):
        for key, val in state.items():
            setattr(self, key, val)

        self.f1 = chi2(*self.par1)
        self.f2 = chi2(*self.par2)

        return
Example #5
    def test_1D_is_chisquared(self):
        # The 1-dimensional Wishart with an identity scale matrix is just a
        # chi-squared distribution.
        # Test variance, mean, entropy, pdf
        # Kolmogorov-Smirnov test for rvs
        np.random.seed(482974)

        sn = 500
        dim = 1
        scale = np.eye(dim)

        df_range = np.arange(1, 10, 2, dtype=float)
        X = np.linspace(0.1,10,num=10)
        for df in df_range:
            w = wishart(df, scale)
            c = chi2(df)

            # Statistics
            assert_allclose(w.var(), c.var())
            assert_allclose(w.mean(), c.mean())
            assert_allclose(w.entropy(), c.entropy())

            # PDF
            assert_allclose(w.pdf(X), c.pdf(X))

            # rvs
            rvs = w.rvs(size=sn)
            args = (df,)
            alpha = 0.01
            check_distribution_rvs('chi2', args, alpha, rvs)
Example #6
    def test_is_scaled_chisquared(self):
        # The 2-dimensional Wishart with an arbitrary scale matrix can be
        # transformed to a scaled chi-squared distribution.
        # For :math:`S \sim W_p(V,n)` and :math:`\lambda \in \mathbb{R}^p` we have
        # :math:`\lambda' S \lambda \sim \lambda' V \lambda \times \chi^2(n)`
        np.random.seed(482974)

        sn = 500
        df = 10
        dim = 4
        # Construct an arbitrary positive definite matrix
        scale = np.diag(np.arange(4)+1)
        scale[np.tril_indices(4, k=-1)] = np.arange(6)
        scale = np.dot(scale.T, scale)
        # Use :math:`\lambda = [1, \dots, 1]'`
        lamda = np.ones((dim,1))
        sigma_lamda = lamda.T.dot(scale).dot(lamda).squeeze()
        w = wishart(df, sigma_lamda)
        c = chi2(df, scale=sigma_lamda)

        # Statistics
        assert_allclose(w.var(), c.var())
        assert_allclose(w.mean(), c.mean())
        assert_allclose(w.entropy(), c.entropy())

        # PDF
        X = np.linspace(0.1,10,num=10)
        assert_allclose(w.pdf(X), c.pdf(X))

        # rvs
        rvs = w.rvs(size=sn)
        args = (df,0,sigma_lamda)
        alpha = 0.01
        check_distribution_rvs('chi2', args, alpha, rvs)
Example #7
    def correct_covariance(self, data):
        """Apply a correction to raw Minimum Covariance Determinant estimates.

        Correction using the empirical correction factor suggested
        by Rousseeuw and Van Driessen in [Rouseeuw1984]_.

        Parameters
        ----------
        data: array-like, shape (n_samples, n_features)
          The data matrix, with p features and n samples.
          The data set must be the one which was used to compute
          the raw estimates.

        Returns
        -------
        covariance_corrected: array-like, shape (n_features, n_features)
          Corrected robust covariance estimate.

        """
        X_centered = data - self.raw_location_
        dist = np.sum(
            np.dot(X_centered, linalg.pinv(self.raw_covariance_)) * X_centered,
            1)
        correction = np.median(dist) / chi2(data.shape[1]).isf(0.5)
        covariance_corrected = self.raw_covariance_ * correction
        self._set_estimates(covariance_corrected)
        return covariance_corrected
Example #8
    def jtest(self, theta, **kwargs):
        """J-test for misspecification of the model.

        Tests whether all intercepts alphas are simultaneously zero.

        Parameters
        ----------
        theta : (dim_k*(dim_n+1)-1, ) array
            Parameter vector

        Returns
        -------
        jstat : float
            J-statistic
        jpval : float
            Corresponding p-value of the test, in percent

        """

        dim_n, dim_k = self.__get_dimensions()[1:]
        param_var = self.compute_theta_var(theta, **kwargs)
        alpha_var = param_var[0:dim_n*dim_k:dim_k, 0:dim_n*dim_k:dim_k]
        eig = np.linalg.eigvalsh(alpha_var).min()
        if eig <= 0:
            alpha_var -= np.eye(dim_n) * eig * 1.1
        inv_var = np.linalg.pinv(alpha_var)
        try:
            np.linalg.cholesky(inv_var)
        except np.linalg.LinAlgError:
            warnings.warn('Inverse of alpha variance is not P.D.!')
        alpha = self.convert_theta_to2d(theta)[0]
        jstat = (alpha.dot(inv_var) * alpha).sum()
        jpval = 1 - chi2(dim_n).cdf(jstat)
        return jstat, jpval*100
Example #9
def calculate_Var_confidence_interval_large(series, confidence_interval=0.95):
    count = series.count()
    var = series.var()
    upper = (count - 1) * var
    rv = chi2(count - 1)
    alpha = 1 - confidence_interval
    return FloatInterval.closed(round(upper / rv.isf(alpha / 2), 2), round(upper / rv.isf(1 - alpha / 2), 2))
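
A usage sketch (illustrative; assumes chi2 is imported from scipy.stats and FloatInterval from the third-party intervals package, which the snippet appears to rely on):

import numpy as np
import pandas as pd
s = pd.Series(np.random.normal(scale=2.0, size=100))
ci = calculate_Var_confidence_interval_large(s)  # 95% CI for the variance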
Example #10
def PlotChi2DistributionDistributionFunction(df):
    if df>0:
        main_frame = QtGui.QWidget()
        dpi = 100
        fig = Figure((5.0, 4.0), dpi=dpi)
        canvas = FigureCanvas(fig)
        canvas.setParent(main_frame)

        axes = fig.add_subplot(111)
        mpl_toolbar = NavigationToolbar(canvas, main_frame)

        hbox = QtGui.QHBoxLayout()
        vbox = QtGui.QVBoxLayout()
        vbox.addWidget(canvas)
        vbox.addWidget(mpl_toolbar)
        vbox.addLayout(hbox)
        main_frame.setLayout(vbox)

        alpha = 0.0005
        sequence = stats.chi2.isf(alpha, df)

        x = np.linspace(-sequence, sequence, 1000)
        rv = stats.chi2(df)
        y = rv.cdf(x)

        axes.plot(x,y)
        canvas.draw()

        return main_frame
    else:
        return False, "Serbestlik derecesi 0'dan kucuk olamaz."
Example #11
def plot_gmm_confidence_ellipses(ax, means, covariances, colors,
                                 confidence=0.95, plot_eigenvectors=True):
    """Plots ellipses for gmm covariances.

    :param ax:
    :param means: (n_components, n_features) means.
    :param covariances: (n_components, n_features, n_features) covariances.
    :param colors: ellipse colors.
    :param confidence:
    :param plot_eigenvectors:
    :return:
    """

    n_components, n_features = means.shape
    alpha = np.sqrt(chi2(n_features).ppf(confidence))

    for k in range(n_components):
        # plot ellipse from covariance
        values, vectors = _eig_sort(covariances[k])
        w, h = 2 * alpha * np.sqrt(values)
        angle = np.degrees(np.arctan2(vectors[1, 0], vectors[0, 0]))
        ax.add_artist(
            Ellipse(means[k], w, h, angle, color=colors[k], fill=False))

        # plot eigenvectors if needed
        if plot_eigenvectors:
            arrow_params = {'color': colors[k], 'length_includes_head': True,
                            'head_width': 0.05, 'head_length': 0.1}
            ax.arrow(*means[k], *(vectors[:, 0] * w / 2), **arrow_params)
            ax.arrow(*means[k], *(vectors[:, 1] * h / 2), **arrow_params)
Example #12
def get_stats(P):
    pdfPk = []
    for i in range(P.shape[1]):
        N = float(P.shape[0])
        var = np.sum(P[:,i])/(N*(4.-2.))
        pdfPk.append(chi2(4.,scale=var))
    return pdfPk
Example #13
 def distModelIndexChanged_hndlr(self):
     '''
     Handler for changing the selected item in the combobox under the probability plot.
     :return:
     '''
     index = self.distModelBox.currentIndex()
     if index == 0:
         self.probModel = stats.norm
         self.ddofEdit.setDisabled(True)
     elif index == 1:
         self.probModel = stats.expon
         self.ddofEdit.setDisabled(True)
     elif index == 2:
         self.probModel = stats.laplace
         self.ddofEdit.setDisabled(True)
     elif index == 3:
         try:
             self.df = np.float64(self.ddofEdit.text())
         except:
             self.ddofEdit.setText(str(self.df))
         self.probModel = stats.chi2(self.df)
         self.ddoflabel.setText(_('ProboPlot','Number of degrees of freedom'))
         self.ddofEdit.setEnabled(True)
     elif index == 4:
         try:
             self.df = np.float64(self.ddofEdit.text())
         except:
             self.ddofEdit.setText(str(self.df))
         self.probModel = stats.exponweib(a=1,c=self.df)
         self.ddoflabel.setText(_('ProboPlot','Shape of distribution'))
         self.ddofEdit.setEnabled(True)
     try:
         self.drawProbPlot(self.currDist)
     except:
         return
Example #14
    def correct_covariance(self, data):
        """Apply a correction to raw Minimum Covariance Determinant estimates.

        Correction using the empirical correction factor suggested
        by Rousseeuw and Van Driessen in [RVD]_.

        Parameters
        ----------
        data : array-like, shape (n_samples, n_features)
            The data matrix, with p features and n samples.
            The data set must be the one which was used to compute
            the raw estimates.

        References
        ----------

        .. [RVD] `A Fast Algorithm for the Minimum Covariance
            Determinant Estimator, 1999, American Statistical Association
            and the American Society for Quality, TECHNOMETRICS`

        Returns
        -------
        covariance_corrected : array-like, shape (n_features, n_features)
            Corrected robust covariance estimate.

        """
        correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)
        covariance_corrected = self.raw_covariance_ * correction
        self.dist_ /= correction
        return covariance_corrected
Example #15
def profile_likelihood_crit(profile_likelihood,
                            max_likelihood,
                            clevels=[0.674, 0.95, 0.997],
                            log=True):
    """
    Return the critical values of the profile
    likelihood that correspond to the given confidence
    levels (based on the likelihood ratio test).
    
    Useful for the calculation of confidence intervals.
    
    Parameters
    ----------
    profile_likelihood : n-d array_like
        the profile (log)likelihood
    max_likelihood : float
        maximized value of the (log) likelihood
    clevels : list
        confidence levels
    log : bool
        must be True if log-likelihoods are
        provided
        
    """
    df = profile_likelihood.ndim
    lambda_crit = np.array([stats.chi2(df).ppf(cl)
                            for cl in clevels])
    ploglike_crit = (2. * max_likelihood - lambda_crit) / 2.
    
    if log:
        return ploglike_crit
    else:
        return np.exp(ploglike_crit)
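
A usage sketch with a made-up one-parameter profile (assumes numpy as np and scipy.stats as stats are imported, as the function itself does):

import numpy as np
max_loglike = -120.3                                 # maximized log-likelihood
profile = max_loglike - np.linspace(0., 3., 61)**2   # fake 1-d profile scan
crit = profile_likelihood_crit(profile, max_loglike)
# the confidence regions are where profile >= each critical value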
Example #16
 def error_string1(df, chi2):
     rv = stats.chi2(df)
     p_val = 1 - rv.cdf(chi2)
     
     ans = '(degrees of freedom) df = %s and the p-value = %.3g'\
            % (df, p_val)
     return ans
Example #17
    def __setstate__(self, state):
        for key, val in state.items():
            setattr(self, key, val)

        self.f = chi2(*self.par)

        return
Example #18
def Chi2ProbabilitiesLowerTail(values, df):
    if len(values)>0 and df>0:
        outputStr = ""
        areas = []

        for val in values:
            outputStr += str(val)

            rv = stats.chi2(df, loc=0, scale=1)
            area = rv.cdf(val)
            area = "{0:.5f}".format(area)
            areas.append(area)

            if len(values) >1 and values.index(val) < len(values) - 1 : 
                    outputStr += ", "
            else: 
                    outputStr += "" 

        outputStr += ", serbestlik derecesi: " + str(df) 
        return outputStr, areas

    elif df<=0:
        return False, "Degrees of freedom cannot be less than 0."
    else:
        return False, "A valid probability value must be entered."
Example #19
def Chi2QuantilesLowerTail(probs, df):
    if len(probs)>0 and df>0:

        outputStr = ""
        yArray = []

        for prob in probs:
            outputStr += str(prob)

            if prob> 0 and prob<1:

                rv = stats.chi2(df, loc = 0, scale = 1)
                y = rv.ppf(prob)
                y = "{0:.5f}".format(y)
                yArray.append(y) 

            else:
                yArray.append("NaN") 
            
            if len(probs) >1 and probs.index(prob) < len(probs) - 1 : 
                    outputStr += ", "
            else: 
                    outputStr += "" 

        outputStr += ", serbestlik derecesi: " + str(df) 
        return outputStr, yArray

    elif df<=0:
        return False, "Degrees of freedom cannot be less than 0."
    else:
        return False, "A valid probability value must be entered."
Example #20
def mcnemar_test(test_1, test_2, significance=0.01):
    """
    Perform McNemar's statistical test.

    Parameters
    ----------
    test_1 : numpy array
        Test 1 sample(s).
    test_2 : numpy array
        Test 2 sample(s).
    significance : float, optional
        Significance level.

    Returns
    -------
    significance : int
        Significance {-1, 0, +1}.
    p_value : float
        P-value.

    Notes
    -----
    Please see: http://en.wikipedia.org/wiki/McNemar%27s_test

    +-----------------+-----------------+-----------------+-----------+
    |                 | Test 2 positive | Test 2 negative | Row total |
    +-----------------+-----------------+-----------------+-----------+
    | Test 1 positive |        a        |        b        |   a + b   |
    | Test 1 negative |        c        |        d        |   c + d   |
    +-----------------+-----------------+-----------------+-----------+
    | Column total    |      a + c      |      b + d      |     n     |
    +-----------------+-----------------+-----------------+-----------+

    """
    from scipy.stats import chi2
    # convert the tests to numpy arrays
    test_1 = np.asarray(test_1)
    test_2 = np.asarray(test_2)
    # both tests must have the same size and shape
    if not (test_1.size == test_2.size and test_1.shape == test_2.shape):
        raise ValueError("Both tests must have the same size and shape.")
    # calculate a, b, c, d
    # a = np.sum(test_1 * test_2)
    b = np.sum(test_1 > test_2)
    c = np.sum(test_1 < test_2)
    # d = np.sum(-test_1 * -test_2)
    # is the approximation ok?
    if b + c < 25:
        raise NotImplementedError("implement correct binomial distribution or "
                                  "use bigger sample sizes (b + c >= 25)")
    # statistical test
    stat = (b - c) ** 2 / float(b + c)
    # test under chi square distribution
    p = chi2(1).sf(stat)
    # direction of significance
    sig = 0
    if p < significance:
        sig = 1 if b > c else -1
    return sig, p
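
A usage sketch with synthetic binary outcomes of two classifiers on the same cases (illustrative only; assumes numpy imported as np, as the function does):

import numpy as np
rng = np.random.RandomState(0)
test_1 = rng.rand(200) < 0.7  # classifier 1 correct/incorrect per case
test_2 = rng.rand(200) < 0.5  # classifier 2 on the same cases
sig, p = mcnemar_test(test_1, test_2)  # sig: +1 test_1 better, -1 worse, 0 n.s.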
Example #21
 def setup_class(cls):
     cls.rng = RandomState(23456)
     fixed_rng = stats.chi2(10)
     cls.t = t = 1000
     cls.k = k = 50
     cls.losses = fixed_rng.rvs((t, k))
     index = pd.date_range('2000-01-01', periods=t)
     cls.losses_df = pd.DataFrame(cls.losses, index=index)
Example #22
 def star_optimize_alpha_threshold(self):
     alpha = self.doubleSpinBox_optimize_alpha.value()
     apsize = float(self.comboBox_apsize.currentText())
     nobs = np.count_nonzero(self.p.aperture[apsize].frames_mask)
     chi2dist = chi2(nobs-2)
     chi2limits = np.divide(chi2dist.interval(alpha), nobs-2)
     self.doubleSpinBox_optimize_lower.setValue(chi2limits[0])
     self.doubleSpinBox_optimize_upper.setValue(chi2limits[1])
Example #23
def find_optimal_T_chi2(bg_rate, m, P):
    """Returns the min. T so that bg_rate is <= than lower_conf_rate(m,T,P).

    This is equivalent but much faster than find_optimal_T_iter().
    Note: this is based on the confidence interval for the sum of m exponential variables.
    """
    T = 0.5*chi2(2*m).ppf(P)/bg_rate
    return T
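
A usage sketch with illustrative numbers (assumes chi2 is imported from scipy.stats, as the function requires):

T = find_optimal_T_chi2(bg_rate=1000., m=10, P=0.99)  # minimal window T for this background rate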
Example #24
def get_mswd_limits(n, k=1):
    dof = n - k
    # calculate the reduced chi2 95% interval for given dof
    # use scale parameter to calculate the chi2_reduced from chi2
    from scipy.stats import chi2

    rv = chi2(dof, scale=1 / float(dof))
    return rv.interval(0.95)
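
A usage sketch (illustrative): the acceptable MSWD (reduced chi-square) band for n = 20 analyses and a one-parameter fit:

low, high = get_mswd_limits(20)  # roughly (0.47, 1.73) for 19 degrees of freedom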
Example #25
    def __init__(self, x, y, sig=[], chi2limit=0.95, customlimits=[], outlier_threshold=[-3.0, 3.0], maxiter=50):

        nanmask = np.logical_or(np.isnan(x), np.isnan(y))

        mask = np.ones(len(x), dtype=bool)

        iter = 1
        rejn = 0

        # reject outliers

        while True:

            xrlm = x[mask & ~nanmask]
            yrlm = y[mask & ~nanmask]
            X = sm.add_constant(xrlm)
            rlm = sm.RLM(yrlm, X, missing='none', M=sm.robust.norms.TukeyBiweight()).fit()
            residuals = y - (rlm.params[0]+rlm.params[1]*x)
            mad = np.median(np.absolute(residuals))
            sigmad = mad*1.4286
            ratio = residuals/sigmad
            maskit = (ratio > outlier_threshold[0]) & (ratio < outlier_threshold[1])
            if np.array_equal(mask, maskit) == True or iter >= maxiter:
                self.outliers_mask = mask
                self.rlm_params = rlm.params
                self.niter = iter
                break
            else:
                mask = np.copy(maskit)

            iter += 1

        # weighted linear fit to cleaned data

        xlfit = x[self.outliers_mask]
        ylfit = y[self.outliers_mask]
        siglfit = sig[self.outliers_mask] if len(sig) else []
        weights = 1/siglfit**2 if len(siglfit) > 0 else None
        polyfit = np.polyfit(xlfit, ylfit, deg=1, w=weights)

        polyfit_resid = ylfit - (polyfit[1]+polyfit[0]*xlfit)
        polyfit_dof = len(xlfit) - 2

        if len(sig) > 0:
            polyfit_zval = polyfit_resid/siglfit
            polyfit_chi2 = np.sum(polyfit_zval**2)
            self.polyfit_redchi2 = polyfit_chi2/polyfit_dof

        self.polyfit_rms = math.sqrt(np.sum(polyfit_resid**2)/polyfit_dof)
        self.polyfit_chi2dist = chi2(polyfit_dof)
        self.polyfit = polyfit

        if len(customlimits) == 2:
            self.polyfit_chi2limits = customlimits
        else:
            self.polyfit_chi2limits = np.divide(self.polyfit_chi2dist.interval(chi2limit), float(polyfit_dof))
Example #26
def chi2pval(data):
	av = np.average(data)
	va = np.var(data)
	hist, binEdges = np.histogram(data, bins=50, density=True)
	rvn = stats.norm(loc = av, scale = np.sqrt(va))
	eHist = np.array([rvn.cdf(binEdges[i+1])-rvn.cdf(binEdges[i]) for i in range(len(binEdges)-1)])
	chi2 = np.sum(np.power(hist-eHist,2)/eHist)
	df = len(hist)-1
	rv = stats.chi2(df)
	print(chi2, df, 1 - rv.cdf(chi2))
Example #27
def normal_plevels(n):
    """
    Return an array of values of the probability within a +- k*sigma region
    centered on the mean of a normal distribution, for k=1 to n.
    """
    from numpy import array
    from scipy import stats
    c1cdf = stats.chi2(1).cdf
    levels = []
    for i in range(1,n+1):
        levels.append(c1cdf(i**2))
    return array(levels)
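
A usage sketch (illustrative): since P(|Z| <= k) = chi2(1).cdf(k**2) for a standard normal Z, the first three levels are the familiar sigma probabilities:

levels = normal_plevels(3)  # approx [0.6827, 0.9545, 0.9973]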
Example #28
def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39):
    """ Given a mean and covariance for multi-variate
    gaussian, returns npoints points for the ellipse
    of confidence given by level (all points will be inside
    the ellipsoides with a probability equal to level)
    
    Returns the coordinate x and y of the ellipse"""
    
    c       = np.array(dim)

    if mu.size < 2:
        raise RuntimeError("this function only make sense for dimension 2 and more")

    if mu.size == va.size:
        mode    = 'diag'
    else:
        if va.ndim == 2:
            if va.shape[0] == va.shape[1]:
                mode    = 'full'
            else:
                raise DenError("variance not square")
        else:
            raise DenError("mean and variance are not dim conformant")

    # If X ~ N(mu, va), then [X` * va^(-1/2) * X] ~ Chi2
    chi22d  = stats.chi2(2)
    mahal   = np.sqrt(chi22d.ppf(level))
    
    # Generates a circle of npoints
    theta   = np.linspace(0, 2 * np.pi, npoints)
    circle  = mahal * np.array([np.cos(theta), np.sin(theta)])

    # Get the dimension which we are interested in:
    mu  = mu[dim]
    if mode == 'diag':
        va      = va[dim]
        elps    = np.outer(mu, np.ones(npoints))
        elps    += np.dot(np.diag(np.sqrt(va)), circle)
    elif mode == 'full':
        va  = va[c,:][:,c]
        #print "va = ", v a
        # Method: compute the cholesky decomp of each cov matrix, that is
        # compute cova such as va = cova * cova' 
        # WARN: scipy is different than matlab here, as scipy computes a lower
        # triangular cholesky decomp: 
        #   - va = cova * cova' (scipy)
        #   - va = cova' * cova (matlab)
        # So take care when comparing results with matlab !
        cova    = np.linalg.cholesky(va)
        elps    = np.outer(mu, np.ones(npoints))
        elps    += np.dot(cova, circle)
    else:
        raise DenParam("var mode not recognized")

    return elps[0, :], elps[1, :]
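
A usage sketch for the diagonal-covariance case (illustrative; assumes numpy as np and scipy.stats as stats are imported, as the function itself expects):

import numpy as np
mu = np.array([1.0, -2.0])
va = np.array([2.0, 0.5])               # per-dimension variances -> 'diag' mode
xs, ys = gauss_ell(mu, va, level=0.95)  # points on the 95% confidence ellipse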
Example #29
    def reweight_covariance(self, data):
        """Reweight raw Minimum Covariance Determinant estimates.

        Reweight observations using Rousseeuw's method (equivalent to
        deleting outlying observations from the data set before
        computing location and covariance estimates). [1]

        Parameters
        ----------
        data: array-like, shape (n_samples, n_features)
          The data matrix, with p features and n samples.
          The data set must be the one which was used to compute
          the raw estimates.

        Returns
        -------
        location_reweighted: array-like, shape (n_features, )
          Reweighted robust location estimate.
        covariance_reweighted: array-like, shape (n_features, n_features)
          Reweighted robust covariance estimate.
        support_reweighted: array-like, type boolean, shape (n_samples,)
          A mask of the observations that have been used to compute
          the reweighted robust location and covariance estimates.

        Notes
        -----
        References:
        [1] A Fast Algorithm for the Minimum Covariance Determinant Estimator,
            1999, American Statistical Association and the American Society
            for Quality, TECHNOMETRICS

        """
        n_samples, n_features = data.shape
        X_centered = data - self.location_
        if self.store_precision:
            precision = self.precision_
        else:
            precision = linalg.pinv(self.covariance_)

        dist = np.sum(
            np.dot(X_centered, precision) * X_centered,
            1)
        mask = dist < chi2(n_features).isf(0.025)
        if self.assume_centered:
            location_reweighted = np.zeros(n_features)
        else:
            location_reweighted = data[mask].mean(0)
        covariance_reweighted = self._nonrobust_covariance(
            data[mask], assume_centered=self.assume_centered)
        support_reweighted = np.zeros(n_samples).astype(bool)
        support_reweighted[mask] = True
        self._set_estimates(covariance_reweighted)
        self.location_ = location_reweighted
        self.support_ = support_reweighted
        return location_reweighted, covariance_reweighted, support_reweighted
Example #30
def Chi2(df, tag=None):
    """
    A Chi-Squared random variate
    
    Parameters
    ----------
    df : int
        The degrees of freedom of the distribution (must be greater than one)
    """
    assert isinstance(df, int) and df>1, 'DF must be an int greater than 1'
    return uv(rv=ss.chi2(df), tag=tag)
Example #31
    def buildKernelAdapt(self, X, C, y, regions, reml=True, maxiter=100):

        #prepare initial values for sig2e and for fixed effects
        hyp0_sig2e, hyp0_fixedEffects = self.getInitialHyps(X, C, y)

        bestKernelNames = []
        kernelsListAll = []
        hyp_kernels = []

        funcToSolve = self.infExact_scipy
        yVar = y.var()

        for r_i, r in enumerate(regions):

            #if (r_i == 0): kernelsToTry = ['lin']
            #else:
            #	kernelsToTry = ['lin', 'poly2_lin', 'rbf_lin', 'nn_lin']
            kernelsToTry = ['lin', 'poly2_lin', 'rbf_lin', 'nn_lin']
            if self.verbose:
                print()
                print('selecting a kernel for region', r_i, 'with', r.sum(), 'SNPs')

            #add linear kernel
            X_lastRegion = X[:, r]
            linKernel = kernels.linearKernel(X_lastRegion)
            kernelsListAll.append(kernels.ScaledKernel(linKernel))
            kernelsListAll.append(None)

            bestFun = np.inf
            bestKernelName = None
            best_hyp0 = None
            bestKernel = None
            bestPval = np.inf

            #iterate over every possible kernel
            for kernelToTry in kernelsToTry:
                hyp0 = [0.5 * np.log(0.5 * yVar)]
                if self.verbose: print('Testing kernel:', kernelToTry)

                #create the kernel
                if (kernelToTry == 'lin'):
                    kernel = None
                    df = None
                elif (kernelToTry == 'rbf_lin'):
                    kernel = kernels.RBFKernel(X_lastRegion)
                    hyp0.append(np.log(1.0))  #ell
                    df = 2
                elif (kernelToTry == 'nn_lin'):
                    kernel = kernels.NNKernel(X_lastRegion)
                    hyp0.append(np.log(1.0))  #ell
                    df = 2
                elif (kernelToTry == 'poly2_lin'):
                    kernel = kernels.Poly2KernelHomo(linKernel)
                    df = 1
                else:
                    raise Exception('unrecognized kernel name')

                if (kernel is not None):
                    #scale the kernel
                    kernel = kernels.ScaledKernel(kernel)
                    hyp0.append(0.5 * np.log(0.5 * yVar))  #scaling hyp

                    #add the kernel as the final kernel in the kernels list
                    kernelsListAll[-1] = kernel
                    sumKernel = kernels.SumKernel(kernelsListAll)
                else:
                    sumKernel = kernels.SumKernel(kernelsListAll[:-1])

                #test log likelihood obtained with this kernel for this region
                args = (sumKernel, C, y, reml)
                self.optimization_counter = 0
                hyp0_all = np.concatenate(
                    (hyp0_sig2e, hyp0_fixedEffects, hyp_kernels + hyp0))
                optObj = gpUtils.minimize(hyp0_all, funcToSolve, -maxiter,
                                          *args)
                if (not optObj.success):
                    print('Optimization status:', optObj.status)
                    print('optimization message:', optObj.message)
                    raise Exception('optimization failed')

                print('final LL: %0.5e' % (-optObj.fun))
                if (kernelToTry == 'lin'):
                    linLL = -optObj.fun
                    pVal = 1.0
                else:
                    llDiff = -optObj.fun - linLL
                    if (llDiff < 0): pVal = 1.0
                    else: pVal = 0.5 * stats.chi2(df).sf(llDiff)
                    print('llDiff: %0.5e' % llDiff, 'pVal: %0.5e' % pVal)

                if (kernelToTry == 'lin'
                        or (pVal < bestPval and
                            (len(kernelsToTry) == 1 or pVal < 0.05 /
                             (len(kernelsToTry) - 1)))):
                    bestOptObj = optObj
                    bestPval = pVal
                    bestKernelName = kernelToTry
                    best_hyp0 = hyp0
                    best_sumKernel = sumKernel
                    bestKernel = kernel

            if (bestKernel is not None): kernelsListAll[-1] = bestKernel
            else: kernelsListAll = kernelsListAll[:-1]
            hyp_kernels += best_hyp0
            bestKernelNames.append(bestKernelName)

            if self.verbose: print('selected kernel:', bestKernelName)

        if self.verbose:
            print('selected kernels:', bestKernelNames)
            print()

        return bestKernelNames
Example #32
 def test_Uniform_to_ChiSquare(self):
     X = RV(Uniform(a=0, b=1))
     sims = (-2 * log(X)).sim(Nsim)
     cdf = stats.chi2(df=2).cdf
     pval = stats.kstest(sims, cdf).pvalue
     self.assertTrue(pval > .01)
Example #33
# In[89]:

# Create crosstab of variables of interest
tab = pd.crosstab(data['V1'], data['V7'])

# In[90]:

tab

# In[91]:

from scipy.stats import chi2_contingency as chi2

# In[92]:

chi2(tab)

# #### Persons who belong to a farmers association are more likely to have attended a training before

# ### Are those visited by extension officers more likely to have attended trainings in the past

# In[93]:

tab2 = pd.crosstab(data['V3'], data['V7'])
chi2(tab2)

# #### Persons who have been visited by an extension officer were more likely to have attended a training before

# ***

# # Distribution of correct responses
Example #34
def test():
    import DDFacet.ToolsDir.Gaussian
    _,_,PSF=DDFacet.ToolsDir.Gaussian.Gaussian(10,311,1.)
    #PSF.fill(1.)
    
    #import scipy.signal
    #PP=scipy.signal.fftconvolve(PSF,PSF, mode='same')
    
    #print Fact
    import pylab
    pylab.clf()
    pylab.imshow(PSF,interpolation="nearest") 
    pylab.colorbar()
    pylab.draw()
    pylab.show(False)
    pylab.pause(0.1)

    Dirty=np.zeros_like(PSF)
    nx,_=Dirty.shape
    Dirty[nx//2,nx//2+10]+=2.
    Dirty[nx//2+10,nx//2+10]+=2.
    Dirty=np.random.randn(*(Dirty.shape))
    
    PSF=PSF.reshape((1,1,nx,nx))*np.ones((2,1,1,1))
    Dirty=Dirty.reshape((1,1,nx,nx))*np.ones((2,1,1,1))
    Dirty[1,:,:,:]=Dirty[0,:,:,:]*2
    x,y=np.mgrid[0:nx,0:nx]
    dx=10
    nc=nx//2
    x=x[nc-dx:nc+dx,nc-dx:nc+dx].flatten()
    y=y[nc-dx:nc+dx,nc-dx:nc+dx].flatten()
    ListPixParms=[(x[i],y[i]) for i in range(x.size)]
    x,y=np.mgrid[0:nx,0:nx]

    dx=10
    x=x[nc-dx:nc+dx,nc-dx:nc+dx].flatten()
    y=y[nc-dx:nc+dx,nc-dx:nc+dx].flatten()
    ListPixData=[(x[i],y[i]) for i in range(x.size)]
    CC=ClassConvMachine(PSF,ListPixParms,ListPixData,"Matrix")
    
    NFreqBands,_,_,_=Dirty.shape
    NPixListParms=len(ListPixParms)
    NPixListData=len(ListPixData)
    Array=np.zeros((NFreqBands,1,NPixListParms),np.float32)
    x0,y0=np.array(ListPixParms).T
    for iBand in range(NFreqBands):
        Array[iBand,0,:]=Dirty[iBand,0,x0,y0]


    Array=Array.reshape((NFreqBands,NPixListParms))

    import pylab


    Lchi0=[]
    Lchi1=[]


    NTries=5000
    ArrKeep0=np.zeros((NTries,NPixListParms),Array.dtype)
    ArrKeep1=np.zeros((NTries,NPixListParms),Array.dtype)


    for i in range(NTries):
        Array=np.random.randn(*Array.shape)
        #T=ClassTimeIt.ClassTimeIt()
        chi0=np.sum(Array**2)
        Lchi0.append(chi0)
        ConvArray0=CC.Convolve(Array)
        chi1=np.sum(ConvArray0**2)
        #T.timeit("0")
        #ConvArray1=CC.Convolve(Array,ConvMode="Vector").ravel()
        #T.timeit("1")
        #r=chi1/chi0
        #print "%f -> %f [%r]"%(chi0,chi1,r)
        NChan,_,NN=ConvArray0.shape
        NN=int(np.sqrt(NN))
        ArrKeep0[i]=Array[0].ravel()
        ArrKeep1[i]=ConvArray0[0].ravel()
        # pylab.clf()
        # pylab.imshow(ConvArray0.reshape((2,NN,NN))[0],interpolation="nearest")
        # pylab.draw()
        # pylab.show(False)
        # pylab.pause(0.1)


        Lchi1.append(chi1)
        #print np.var(Array),np.var(ConvArray0)/Fact

    Fact=CC.NormData[0]
    print(np.median(np.std(ArrKeep0,axis=0)**2))
    print(np.median(np.std(ArrKeep1,axis=0)**2/Fact))
    return
    
    from scipy.stats import chi2
    from DDFacet.ToolsDir.GeneDist import ClassDistMachine
    DM=ClassDistMachine()



    rv = chi2(Array.size)
    x=np.linspace(0,2*rv.moment(1),1000)
    P=rv.cdf(x)
    pylab.clf()
    pylab.subplot(2,1,1)
    #yd,xe=pylab.histogram(Lchi0,bins=100,normed=True)
    #xd=(xe[1::]+xe[0:-1])/2.
    #yd/=np.sum(yd)
    xd,yd=DM.giveCumulDist(np.array(Lchi0),Ns=100)
    #dx=xd[1]-xd[0]
    #yd/=dx
    pylab.plot(xd,yd)
    pylab.plot(x,P)
    pylab.xlim(0,1600)
    pylab.subplot(2,1,2)
    xd,yd=DM.giveCumulDist(np.array(Lchi1),Ns=20)
    # yd,xe=pylab.histogram(Lchi1,bins=100,normed=True)
    # xd=(xe[1::]+xe[0:-1])/2.
    # dx=xd[1]-xd[0]
    # yd/=np.sum(yd)
    # yd/=dx
    print(np.mean(Lchi1)/Fact)
    print(np.mean(Lchi0))
    # #pylab.xlim(0,800)
    # #pylab.hist(Lchi1,bins=100)

    import scipy.interpolate
    cdf=scipy.interpolate.interp1d(xd, yd,"cubic")
    x=np.linspace(xd.min(),xd.max(),1000)
    #pylab.plot(x,cdf(x),ls="",marker=".")
    #pylab.plot(xd,yd,ls="",marker="s")
    
    y=cdf(x)
    x,y=xd, yd
    y=y[1::]-y[0:-1]
    x=(x[1::]+x[0:-1])/2.
    pylab.plot(x,y,ls="",marker=".")
    
    #pylab.xlim(0,1600)
    pylab.draw()
    pylab.show(False)
    

    

    # import pylab
    # pylab.clf()
    # #pylab.plot(ConvArray0.ravel())
    # pylab.imshow(PSF[0,0])
    # #pylab.plot(ConvArray1)
    # #pylab.plot(ConvArray1-ConvArray0)
    # pylab.draw()
    # pylab.show(False)
    
    stop
Example #35
 def test_Exponential_to_ChiSquare(self):
     X = RV(Exponential(rate=1 / 2))
     sims = X.sim(Nsim)
     cdf = stats.chi2(df=2).cdf
     pval = stats.kstest(sims, cdf).pvalue
     self.assertTrue(pval > .01)
Example #36
def find_optimal_threshold(m, P):
    """Returns the min. threshold to have prob. < P to be BG (averaging m ph).

    Same formula as find_optimal_T() (must be multiplied by bg to obtain the rate).
    """
    return m / (0.5 * chi2(2 * m).ppf(P))
Example #37
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Define the distribution parameters to be plotted
k_values = [1, 2, 5, 7]
linestyles = ['-', '--', ':', '-.']
mu = 0
x = np.linspace(-1, 20, 1000)

#------------------------------------------------------------
# plot the distributions
fig, ax = plt.subplots(figsize=(5, 3.75))
fig.subplots_adjust(bottom=0.12)

for k, ls in zip(k_values, linestyles):
    dist = chi2(k, mu)

    plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$k=%i$' % k)

plt.xlim(0, 10)
plt.ylim(0, 0.5)

plt.xlabel('$Q$')
plt.ylabel(r'$p(Q|k)$')
plt.title(r'$\chi^2\ \mathrm{Distribution}$')

plt.legend()
plt.show()
Example #38
 def __init__(self, theta):
     self._chi2 = chi2(theta)
Example #39
# # -*- coding: utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2

df = 5
rv = chi2(df)

# Generate a sample of size 1000 from it
sampleRange = chi2.rvs(df, size=1000)
# Plot a histogram of the sample and draw the theoretical density of the random variable on top of it.
# plt.hist(sampleRange, normed=True, bins=20, alpha=0.5, label='hist samples')
# plt.ylabel('number of samples')
# plt.xlabel('$x$')
# theoretical probability density of the random variable
left = chi2.ppf(0.01, df)
right = chi2.ppf(0.99, df)
x = np.linspace(left, 20, 100)
# plt.plot(x, rv.pdf(x), 'r-', lw=5, alpha=0.7, label='chi2 pdf')
plt.legend(loc='best')

# plt.show()

# values = np.array([pareto.rvs(k, size=10) for x in range(10)])
# print values
# plt.hist(values.mean(axis=1), normed=True)

m = []
# for _ in xrange(20):
#     m.append(np.mean(chi2.rvs(df, size=1000)))
Example #40
 def r(arr):
     arr = arr.dropna()
     rhos = acf(arr, nlags=12, fft=True)
     test = arr.shape[0] * rhos[-1]**2 / (1 + (rhos[1:-1]**2).sum())
     return test > stats.chi2(1).ppf(0.9)
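
A usage sketch (illustrative; assumes pandas as pd, numpy as np, scipy.stats as stats and acf from statsmodels.tsa.stattools, matching the snippet's names):

import numpy as np
import pandas as pd
s = pd.Series(np.random.randn(240))  # e.g. 20 years of monthly observations
seasonal = r(s)  # True if the lag-12 autocorrelation statistic is significant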
Example #41
#viz.plot_vel_err(ownship, navsys,boxplot=False)
# Tracking results
xy_measurements = [polar_to_cartesian(ground_radar.data[:,k]) for k in range(len(radar_time))]
xy_measurements = np.vstack(xy_measurements).T
_, ax_xy = plt.subplots(1,2)
ax_xy[0].plot(ownship.state[1,:], ownship.state[0,:])
viz.target_xy(target, perfect_pose_imm, ax=ax_xy[0], measurements=xy_measurements)
ax_xy[0].set_title('IMM - perfect pose')
ax_xy[1].plot(ownship.state[1,:], ownship.state[0,:])
viz.target_xy(target, navigation_imm, ax=ax_xy[1], measurements=xy_measurements)
ax_xy[1].set_title('IMM - navigation pose')

viz.target_velocity(target, navigation_imm)
viz.target_velocity(target, perfect_pose_imm)
# NEES plot
UB = chi2(df=2*N_MC).ppf(0.975)/N_MC*np.ones_like(radar_time)
LB = chi2(df=2*N_MC).ppf(0.025)/N_MC*np.ones_like(radar_time)
NEES_fig, consistency_ax = plt.subplots(1,2)
NEES_ax = consistency_ax[0]
NEES_ax.plot(radar_time, UB, 'k')
NEES_ax.plot(radar_time, LB, 'k')
NEES_ax.plot(radar_time, np.mean(NEES_nav, axis=0), label='navigation pose')
NEES_ax.plot(radar_time, np.mean(NEES_perf, axis=0), label='perfect pose')
NEES_ax.legend()
NEES_ax.set_title('NEES of tracking for ' + str(N_MC) + ' monte carlo runs')
RMS_ax = consistency_ax[1]
RMS_ax.plot(radar_time, np.sqrt(np.mean(RMSE_nav, axis=0)), label='navigation pose')
RMS_ax.plot(radar_time, np.sqrt(np.mean(RMSE_perf, axis=0)), label='perfect pose')
RMS_ax.legend()
RMS_ax.set_title('Position RMS error for ' + str(N_MC) + ' monte carlo runs')
Example #42
    def runMetroSingleChain(self, individual0, NSteps=1000, chain_dict={}):

        df = self.PM.NPixListData
        self.rv = chi2(df)
        _, Chi2 = self.GiveFitness(individual0)
        self.MinChi2 = Chi2
        logProb = self.rv.logpdf(Chi2)

        x = np.linspace(0, 2 * self.rv.moment(1), 1000)
        lP = self.rv.logpdf(x)
        iMax = np.argmax(lP)
        self.Chi2PMax = x[iMax]

        # #####################
        # # V0
        #self.Var=self.MinChi2/self.Chi2PMax
        #Chi20_n=self.MinChi2/self.Var
        #VarMin=(3e-3)**2
        #ThVar=np.max([self.Var,VarMin])
        #ShrinkFactor=np.min([1.,self.Var/ThVar])
        # # print
        # # print ShrinkFactor
        # # print
        # # stop
        # #####################
        VarMin = (3e-4)**2
        #self.Var=np.max([self.EstimatedStdFromResid**2,VarMin])
        Var = self.MinChi2 / self.Chi2PMax
        S = self.PM.ArrayToSubArray(individual0, Type="S")
        B = np.sum(np.abs(S)) / float(S.size)
        B0 = 7e-4
        Sig0 = 3e-3
        Sig = B * Sig0 / B0

        # print
        # print "%f %f %f -> %f"%(B,B0,Sig0,Sig)
        # print

        self.Var = np.max([4. * self.EstimatedStdFromResid**2, Sig**2])

        Chi20_n = self.MinChi2 / self.Var
        ShrinkFactor = 1.
        # #####################

        DicoChains = {}
        Parms = individual0

        # ##################################
        DoPlot = True
        if DoPlot:
            import pylab
            pylab.figure(1)
            x = np.linspace(0, 2 * self.rv.moment(1), 1000)
            P = self.rv.pdf(x)
            pylab.clf()
            pylab.plot(x, P)
            Chi2Red = Chi2  # /self.Var
            pylab.scatter(Chi2Red, np.mean(P), c="black")
            pylab.draw()
            pylab.show(False)
        # ##################################

        # ##################################
        DoPlot = False
        # DoPlot=True
        if DoPlot:
            import pylab
            x = np.linspace(0, 2 * self.rv.moment(1), 1000)
            P = self.rv.pdf(x)
            pylab.clf()
            pylab.plot(x, P)
            pylab.scatter(Chi20_n, np.mean(P), c="black")
            pylab.draw()
            pylab.show(False)
        # ##################################

        DicoChains["Parms"] = []
        DicoChains["Chi2"] = []
        DicoChains["logProb"] = []
        logProb0 = self.rv.logpdf(Chi20_n)

        Mut_pFlux, Mut_p0, Mut_pMove = 0.2, 0., 0.3

        #T.disable()
        FactorAccelerate = 1.
        lAccept = []
        NBurn = self.GD["MetroClean"]["MetroNBurnin"]

        NSteps = NSteps + NBurn

        NAccepted = 0
        iStep = 0
        NMax = NSteps  #10000

        #for iStep in range(NSteps):
        while NAccepted < NSteps and iStep < NMax:
            iStep += 1
            #print "========================"
            #print iStep
            individual1, = self.MutMachine.mutGaussian(individual0.copy(),
                                                       Mut_pFlux, Mut_p0,
                                                       Mut_pMove)  #,
            #FactorAccelerate=FactorAccelerate)
            # ds=Noise
            # individual1,=self.MutMachine.mutNormal(individual0.copy(),ds*1e-1*FactorAccelerate)
            # #T.timeit("mutate")

            _, Chi2 = self.GiveFitness(individual1)
            # if Chi2<self.MinChi2:
            #     self.Var=Chi2/self.Chi2PMax
            #     #print "           >>>>>>>>>>>>>> %f"%np.min(Chi2)

            Chi2_n = Chi2 / self.Var

            Chi2_n = Chi20_n + ShrinkFactor * (Chi2_n - Chi20_n)

            logProb = self.rv.logpdf(Chi2_n)

            p1 = logProb
            p0 = logProb0  #DicoChains["logProb"][-1]
            if p1 - p0 > 5:
                R = 1
            elif p1 - p0 < -5:
                R = 0
            else:
                R = np.min([1., np.exp(p1 - p0)])

            r = np.random.rand(1)[0]
            #print "%5.3f [%f -> %f]"%(R,p0,p1)
            # print "MaxDiff ",np.max(np.abs(self.pop[iChain]-DicoChains[iChain]["Parms"][-1]))
            lAccept.append((r < R))
            if r < R:  # accept
                individual0 = individual1
                logProb0 = logProb
                NAccepted += 1
                if NAccepted > NBurn:
                    DicoChains["logProb"].append(p1)
                    DicoChains["Parms"].append(individual1)
                    DicoChains["Chi2"].append(Chi2_n)

                if DoPlot:
                    pylab.scatter(Chi2_n, np.exp(p1), lw=0)
                    pylab.draw()
                    pylab.show(False)
                    pylab.pause(0.1)

                # print "  accept"
                # # Model=self.StackChain()

                # # Asq=self.ArrayMethodsMachine.PM.ModelToSquareArray(Model,TypeInOut=("Parms","Parms"))
                # # _,npol,NPix,_=Asq.shape
                # # A=np.mean(Asq,axis=0).reshape((NPix,NPix))
                # # Mask=(A==0)
                # # pylab.clf()
                # # pylab.imshow(A,interpolation="nearest")
                # # pylab.draw()
                # # pylab.show(False)
                # # pylab.pause(0.1)

            else:

                # # #######################
                if DoPlot:
                    pylab.scatter(Chi2_n, np.exp(p1), c="red", lw=0)
                    pylab.draw()
                    pylab.show(False)
                    pylab.pause(0.1)
                # # #######################
                pass

            #T.timeit("Compare")

            AccRate = np.count_nonzero(lAccept) / float(len(lAccept))
            #print "[%i] Acceptance rate %f [%f with ShrinkFactor %f]"%(iStep,AccRate,FactorAccelerate,ShrinkFactor)
            if (iStep % 50 == 0) & (iStep > 10):
                if AccRate > 0.234:
                    FactorAccelerate *= 1.5
                else:
                    FactorAccelerate /= 1.5
                FactorAccelerate = np.min([3., FactorAccelerate])
                FactorAccelerate = np.max([.01, FactorAccelerate])
                lAccept = []
            #T.timeit("Acceptance")

        T.timeit("Chain")

        chain_dict["logProb"] = np.array(DicoChains["logProb"])
        chain_dict["Parms"] = np.array(DicoChains["Parms"])
        chain_dict["Chi2"] = np.array(DicoChains["Chi2"])
Example #43
def residuals(x, y, a, b, c, d, e):
    return (y - func(x, a, b, c, d, e))**2


# Leastsquare Method
p0 = [1, 1, 1, 1, 1]  # Starting Values
plsq, cov = curve_fit(func, xData, yData, p0, sigma=yerr)
a, b, c, d, e = plsq[0], plsq[1], plsq[2], plsq[3], plsq[4]
np.set_printoptions(precision=2)
print(cov)
yFit = func(xData, a, b, c, d, e)
print "Param for UpFit ist:", 'a= ', a, ' b= ', b, ' c =', c, ' d= ', d, ' e= ', e

# Chisquare test
S = np.sum(residuals(xData, yData, a, b, c, d, e) / (yerr**2))
dof = len(xData) - 5  #Put number of Parameters here
rv = chi2(dof)
chimin, chimax = rv.ppf(0.025), rv.ppf(0.975)
# two sided pvalue test
if S >= dof: pvalue = 2 * (1 - rv.cdf(S))
if S < dof: pvalue = 2 * rv.cdf(S)
#pvalue = rv.cdf(S)
print('chimin:' + str('%.2f' % chimin), 'chimax:' + str(
    '%.2f' % chimax), 'chisquare:' + str('%.2f' % S), 'pvalue: ' + str(
        '%.2f' % pvalue))

# plot Reult
#plt.figure()
#plt.subplot(211)
#plt.title(r'Ein vielsagender Titel')
plt.errorbar(xData, yData, yerr, fmt='o', label=r'Up', color='r')
plt.plot(xData, yFit, label='Up Fit', color='r')
Example #44
_DIST_MAP = {
    dist.BernoulliProbs:
    lambda probs: osp.bernoulli(p=probs),
    dist.BernoulliLogits:
    lambda logits: osp.bernoulli(p=_to_probs_bernoulli(logits)),
    dist.Beta:
    lambda con1, con0: osp.beta(con1, con0),
    dist.BinomialProbs:
    lambda probs, total_count: osp.binom(n=total_count, p=probs),
    dist.BinomialLogits:
    lambda logits, total_count: osp.binom(n=total_count,
                                          p=_to_probs_bernoulli(logits)),
    dist.Cauchy:
    lambda loc, scale: osp.cauchy(loc=loc, scale=scale),
    dist.Chi2:
    lambda df: osp.chi2(df),
    dist.Dirichlet:
    lambda conc: osp.dirichlet(conc),
    dist.Exponential:
    lambda rate: osp.expon(scale=np.reciprocal(rate)),
    dist.Gamma:
    lambda conc, rate: osp.gamma(conc, scale=1. / rate),
    dist.HalfCauchy:
    lambda scale: osp.halfcauchy(scale=scale),
    dist.HalfNormal:
    lambda scale: osp.halfnorm(scale=scale),
    dist.InverseGamma:
    lambda conc, rate: osp.invgamma(conc, scale=rate),
    dist.LogNormal:
    lambda loc, scale: osp.lognorm(s=scale, scale=np.exp(loc)),
    dist.MultinomialProbs:
Example #45
def test_linear_model_parameters_risk_free_gls(data):
    mod = LinearFactorModel(data.portfolios, data.factors, risk_free=True)
    p = mod.portfolios.ndarray
    sigma = np.cov(p.T)
    val, vec = np.linalg.eigh(sigma)
    sigma_m12 = vec @ np.diag(1.0 / np.sqrt(val)) @ vec.T
    sigma_inv = np.linalg.inv(sigma)

    mod = LinearFactorModel(data.portfolios,
                            data.factors,
                            risk_free=True,
                            sigma=sigma)
    assert 'using GLS' in str(mod)
    res = mod.fit()
    f = mod.factors.ndarray
    p = mod.portfolios.ndarray
    n = f.shape[0]
    moments = np.zeros(
        (n, p.shape[1] * (f.shape[1] + 1) + f.shape[1] + 1 + p.shape[1]))
    fc = np.c_[np.ones((n, 1)), f]
    betas = np.linalg.lstsq(fc, p)[0]
    eps = p - fc @ betas
    loc = 0
    for i in range(eps.shape[1]):
        for j in range(fc.shape[1]):
            moments[:, loc] = eps[:, i] * fc[:, j]
            loc += 1
    bc = np.c_[np.ones((p.shape[1], 1)), betas[1:, :].T]
    lam = np.linalg.lstsq(sigma_m12 @ bc, sigma_m12 @ p.mean(0)[:, None])[0]
    pricing_errors = p - (bc @ lam).T

    for i in range(lam.shape[0]):
        lam_error = pricing_errors @ sigma_inv @ bc[:, [i]]
        moments[:, loc] = lam_error.squeeze()
        loc += 1
    alphas = p.mean(0)[:, None] - bc @ lam
    moments[:, loc:] = pricing_errors - alphas.T
    mod_moments = mod._moments(eps, bc, lam, alphas, pricing_errors)

    assert_allclose(res.betas, bc[:, 1:])
    assert_allclose(res.risk_premia, lam.squeeze())
    assert_allclose(res.alphas, alphas.squeeze())
    assert_allclose(moments, mod_moments)

    m = moments.shape[1]
    jac = np.eye(m)
    block1 = p.shape[1] * (f.shape[1] + 1)
    # 1,1

    jac[:block1, :block1] = np.kron(np.eye(p.shape[1]), fc.T @ fc / n)
    # 2, 1
    loc = 0
    nport, nf = p.shape[1], f.shape[1]
    block2 = block1 + nf + 1
    bct = sigma_inv @ bc
    at = sigma_inv @ alphas
    for i in range(nport):
        block = np.zeros((nf + 1, nf + 1))
        for j in range(nf + 1):  # rows
            for k in range(1, nf + 1):  # cols
                block[j, k] = bct[i][j] * lam[k]
                if j == k:
                    block[j, k] -= at[i]
        jac[block1:block2, loc:loc + nf + 1] = block
        loc += nf + 1
    # 2, 2
    jac[block1:block2, block1:block2] = bc.T @ sigma_inv @ bc
    # 3,1
    block = np.zeros((nport, nport * (nf + 1)))
    row = col = 0
    for i in range(nport):
        for j in range(nf + 1):
            if j != 0:
                block[row, col] = lam[j]
            col += 1
        row += 1
    jac[-nport:, :(nport * (nf + 1))] = block
    # 3, 2
    jac[-nport:, (nport * (nf + 1)):(nport * (nf + 1)) + nf + 1] = bc
    # 3, 3: already done since eye
    mod_jac = mod._jacobian(bc, lam, alphas)
    assert_allclose(mod_jac[:block1], jac[:block1])
    assert_allclose(mod_jac[block1:block2, :block1],
                    jac[block1:block2, :block1])
    assert_allclose(mod_jac[block1:block2, block1:block2], jac[block1:block2,
                                                               block1:block2])
    assert_allclose(mod_jac[block1:block2, block2:], jac[block1:block2,
                                                         block2:])
    assert_allclose(mod_jac[block2:], jac[block2:])

    s = moments.T @ moments / (n - (nf + 1))
    ginv = np.linalg.inv(jac)
    cov = ginv @ s @ ginv.T / n
    order = np.zeros((nport, nf + 1), dtype=np.int64)
    order[:, 0] = np.arange(block2, block2 + nport)
    for i in range(nf):
        order[:, i + 1] = (nf + 1) * np.arange(nport) + (i + 1)
    order = np.r_[order.ravel(), block1:block2]
    cov = cov[order][:, order]
    cov = (cov + cov.T) / 2
    assert_allclose(cov, res.cov)

    acov = cov[:block1:(nf + 1), :block1:(nf + 1)]
    jstat = float(alphas.T @ np.linalg.pinv(acov) @ alphas)
    assert_allclose(res.cov.values[:block1:(nf + 1), :block1:(nf + 1)], acov)
    assert_allclose(res.j_statistic.stat, jstat, rtol=1e-1)
    assert_allclose(res.j_statistic.pval,
                    1 - stats.chi2(nport - nf - 1).cdf(jstat),
                    rtol=1e-2)

    get_all(res)
Example #46
from math import log, exp
from numpy import partition
from numpy import mean
from scipy.stats import chi2
from numpy_sugar.special import logsumexp


def _get_median_terms(n):
    if n % 2 == 0:
        nh = n // 2
        kth = [nh - 1, nh]
    else:
        kth = [(n - 1) // 2]
    return kth


_chi2_df1 = chi2(df=1)


def gcontrol(chi2_values):
    """ Genomic control
    """
    n = len(chi2_values)
    kth = _get_median_terms(n)
    chi2_values = partition(chi2_values, kth)
    x2obs = mean(chi2_values[kth])
    x2exp = _chi2_df1.ppf(0.5)
    return x2obs / x2exp
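
A usage sketch (illustrative): for null chi2(1)-distributed association statistics the genomic-control factor should be close to 1:

lam = gcontrol(chi2(df=1).rvs(size=10000, random_state=0))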


def qvalues(pv):
    import rpy2.robjects as robjects
Example #47
def test_linear_model_parameters(data):
    mod = LinearFactorModel(data.portfolios, data.factors)
    res = mod.fit()
    f = mod.factors.ndarray
    p = mod.portfolios.ndarray
    n = f.shape[0]
    moments = np.zeros(
        (n, p.shape[1] * (f.shape[1] + 1) + f.shape[1] + p.shape[1]))
    fc = np.c_[np.ones((n, 1)), f]
    betas = np.linalg.lstsq(fc, p)[0]
    eps = p - fc @ betas
    loc = 0
    for i in range(eps.shape[1]):
        for j in range(fc.shape[1]):
            moments[:, loc] = eps[:, i] * fc[:, j]
            loc += 1
    b = betas[1:, :].T
    lam = np.linalg.lstsq(b, p.mean(0)[:, None])[0]
    pricing_errors = p - (b @ lam).T
    for i in range(lam.shape[0]):
        lam_error = (p - (b @ lam).T) @ b[:, [i]]
        moments[:, loc] = lam_error.squeeze()
        loc += 1
    alphas = pricing_errors.mean(0)[:, None]
    moments[:, loc:] = pricing_errors - alphas.T
    mod_moments = mod._moments(eps, b, lam, alphas, pricing_errors)

    assert_allclose(res.betas, b)
    assert_allclose(res.risk_premia, lam.squeeze())
    assert_allclose(res.alphas, alphas.squeeze())
    assert_allclose(moments, mod_moments)

    m = moments.shape[1]
    jac = np.eye(m)
    block1 = p.shape[1] * (f.shape[1] + 1)
    # 1,1

    jac[:block1, :block1] = np.kron(np.eye(p.shape[1]), fc.T @ fc / n)
    # 2, 1
    loc = 0
    nport, nf = p.shape[1], f.shape[1]
    block2 = block1 + nf
    for i in range(nport):
        block = np.zeros((nf, nf + 1))
        for j in range(nf):  # rows
            for k in range(1, nf + 1):  # cols
                block[j, k] = b[i][j] * lam[k - 1]
                if j + 1 == k:
                    block[j, k] -= alphas[i]
        jac[block1:block2, loc:loc + nf + 1] = block
        loc += nf + 1
    # 2, 2
    jac[block1:block2, block1:block2] = b.T @ b
    # 3,1
    block = np.zeros((nport, nport * (nf + 1)))
    row = col = 0
    for i in range(nport):
        for j in range(nf + 1):
            if j != 0:
                block[row, col] = lam[j - 1]
            col += 1
        row += 1
    jac[-nport:, :(nport * (nf + 1))] = block
    # 3, 2
    jac[-nport:, (nport * (nf + 1)):(nport * (nf + 1)) + nf] = b
    # 3, 3: already done since eye
    mod_jac = mod._jacobian(b, lam, alphas)
    assert_allclose(mod_jac[:block1], jac[:block1])
    assert_allclose(mod_jac[block1:block2, :block1],
                    jac[block1:block2, :block1])
    assert_allclose(mod_jac[block1:block2, block1:block2], jac[block1:block2,
                                                               block1:block2])
    assert_allclose(mod_jac[block1:block2, block2:], jac[block1:block2,
                                                         block2:])
    assert_allclose(mod_jac[block2:], jac[block2:])

    s = moments.T @ moments / (n - (nf + 1))
    ginv = np.linalg.inv(jac)
    cov = ginv @ s @ ginv.T / n
    order = np.zeros((nport, nf + 1), dtype=np.int64)
    order[:, 0] = np.arange(block2, block2 + nport)
    for i in range(nf):
        order[:, i + 1] = (nf + 1) * np.arange(nport) + (i + 1)
    order = np.r_[order.ravel(), block1:block2]
    cov = cov[order][:, order]
    cov = (cov + cov.T) / 2
    assert_allclose(cov, res.cov)

    acov = cov[:block1:(nf + 1), :block1:(nf + 1)]
    jstat = float(alphas.T @ np.linalg.pinv(acov) @ alphas)
    assert_allclose(res.j_statistic.stat, jstat)
    assert_allclose(res.j_statistic.pval,
                    1 - stats.chi2(nport - nf).cdf(jstat))

    get_all(res)

    res = LinearFactorModel(data.portfolios,
                            data.factors).fit(cov_type='kernel',
                                              debiased=False)
    std_mom = moments / moments.std(0)[None, :]
    mom = std_mom.sum(1)
    bw = kernel_optimal_bandwidth(mom)
    w = kernel_weight_bartlett(bw, n - 1)
    s = _cov_kernel(moments, w)
    cov = ginv @ s @ ginv.T / n
    cov = cov[order][:, order]
    cov = (cov + cov.T) / 2
    assert_allclose(cov, res.cov)
Example #48
0
def epf_ineff_manipulate_alt(truepar,
                             dictd,
                             dictv,
                             n=75000,
                             rlow=0,
                             rhigh=500,
                             offset=0,
                             per=1,
                             getlow=-1,
                             gethigh=-1,
                             direct="",
                             jchoice=-1):
    filename = dictd['filename']
    dataname = dictd['dataname']
    dataname2 = dictd['dataname2']
    wdataname = dictd['wdataname']
    vdataname = dictd['vdataname']
    if 'filename2' in dictd:
        filename2 = dictd['filename2']
    if 'fixed' in dictv:
        filename = filename + '_fix'
    Summest = []
    for i in range(int(rlow / per + offset / per),
                   int(ceil(rhigh / per + offset / per))):
        try:
            Summest = Summest + [
                pd.read_csv("../Results/MC/Trials/" + direct + "summary_" +
                            filename + "_" + str(i),
                            header=None)
            ]
        except:
            continue
    if 'filename2' in dictd:
        Summest2 = []
        for i in range(int(rlow / per + offset / per),
                       int(ceil(rhigh / per + offset / per))):
            try:
                Summest2 = Summest2 + [
                    pd.read_csv("../Results/MC/Trials/" + direct + "summary_" +
                                filename2 + "_" + str(i),
                                header=None)
                ]
            except:
                continue
    else:
        Summest2 = Summest
    jacdes = ""
    if jchoice >= 0:
        jacdes = '_jac_' + str(jchoice)
    Varoos = []
    for i in range(int(rlow / per + offset / per),
                   int(ceil(rhigh / per + offset / per))):
        try:
            Varoos = Varoos + [
                pd.read_csv("../Results/MC/Trials/" + direct + "varoos_" +
                            filename + "_" + str(i),
                            header=None)
            ]
        except:
            continue
    Summest = np.array(pd.concat(Summest))
    Summest2 = np.array(pd.concat(Summest2))
    Varoos = np.array(pd.concat(Varoos))
    Qdat = np.array(pd.read_csv("../Results/MC/" + dataname, header=None))
    Qdat2 = np.array(pd.read_csv("../Results/MC/" + dataname2, header=None))
    if wdataname == 'identity':
        Wdat = np.identity(8)
    else:
        Wdat = np.array(pd.read_csv("../Results/MC/" + wdataname, header=None))
    Vdat = np.array(pd.read_csv("../Results/MC/" + vdataname, header=None))
    #older = Summest[:,1] > 0
    #Summest = Summest[older, :]
    #Varoos = Varoos[older, :]
    #jacw = np.prod(np.isnan(Summest[:,njac1:njac1_end].astype(float))==False,1)

    if 'njac' in dictv:
        njac = dictv['njac']
    else:
        njac = njac1

    if 'njaca' in dictv:
        njaca = dictv['njaca']
    else:
        njaca = njac2

    if 'njacend' in dictv:
        njacend = dictv['njacend']
    else:
        njacend = njac1_end

    if 'njacaend' in dictv:
        njacaend = dictv['njacaend']
    else:
        njacaend = njac2_end

    if 'nimom' in dictv:
        nimom = dictv['nimom']
    else:
        nimom = nmom

    if 'nimom_end' in dictv:
        nimom_end = dictv['nimom_end']
    else:
        nimom_end = nmom_end

    if 'namom' in dictv:
        namom = dictv['namom']
    else:
        namom = nepfq

    if 'namom_end' in dictv:
        namom_end = dictv['namom_end']
    else:
        namom_end = nepfq_end

    if 'npar_est' in dictv:
        npar_est = dictv['npar_est']
    else:
        npar_est = 4

    if 'nresi' in dictv:
        nresi = dictv['nresi']
    else:
        nresi = nres

    if 'nresi_end' in dictv:
        nresi_end = dictv['nresi_end']
    else:
        nresi_end = nres_end

    if 'jacname' in dictd:
        jacname = dictd['jacname']
    else:
        jacname = 'jacobians_' + filename

    n_mom = nimom_end - nimom
    print(n_mom)
    n_mom2 = namom_end - namom

    if jchoice == -1:
        jacs = Summest[:, njac:njacend]
        jacs2 = Summest[:, njaca:njacaend]
        #jacs = Summest[:, njac:njacend]
        #jacs2 = Summest[:, njaca:njacaend]
        jacs[jacs == ' '] = ' -nan'
        jacs2[jacs2 == ' '] = ' -nan'
        jacs = jacs.astype(float)
        jacs2 = jacs2.astype(float)
    elif jchoice == -2:
        Jaccest = []
        for i in range(int(rlow / per + offset / per),
                       int(ceil(rhigh / per + offset / per))):
            try:
                Jaccest = Jaccest + [
                    pd.read_csv("../Results/MC/Trials/" + direct + "summary_" +
                                jacname + "_" + str(i),
                                header=None)
                ]
            except:
                continue
        Jaccest = np.array(pd.concat(Jaccest))
        if np.shape(Jaccest)[0] > np.shape(Summest)[0]:
            Jaccest = Jaccest[0:np.shape(Summest)[0]]
        jacs = Jaccest[:, njac:njacend]
        jacs2 = Jaccest[:, njaca:njacaend]
        #jacs = Summest[:, njac:njacend]
        #jacs2 = Summest[:, njaca:njacaend]
        jacs[jacs == ' '] = ' -nan'
        jacs2[jacs2 == ' '] = ' -nan'
        jacs = jacs.astype(float)
        jacs2 = jacs2.astype(float)
    else:
        Jaccest = [
            pd.read_csv("../Results/MC/Trials/" + direct + jacname + "_" +
                        str(i),
                        header=None)
            for i in range(int(rlow / per + offset /
                               per), int(ceil(rhigh / per + offset / per)))
        ]
        Jaccest = np.array(pd.concat(Jaccest))
        Jaccest = Jaccest[ind, :]
        Jaccest = Jaccest[selected, :]
        jacs = Jaccest[:,
                       jchoice * (njac2_end - njac1) + njac - njac1:jchoice *
                       (njac2_end - njac1) + njacend - njac1].astype(float)
        jacs2 = Jaccest[:,
                        jchoice * (njac2_end - njac1) + njaca - njac1:jchoice *
                        (njac2_end - njac1) + njacaend - njac1].astype(float)
    jacw = np.prod(np.isnan(jacs) == False, 1) == 1
    Summest = Summest[0:np.shape(jacs)[0], :]
    Varoos = Varoos[0:np.shape(jacs)[0], :]
    Summest = Summest[jacw, :]
    Varoos = Varoos[jacw, :]
    jacs = jacs[jacw, :]
    jacs2 = jacs2[jacw, :]

    trash, ind = np.unique(Summest[:, 0], return_index=True)
    Summest = Summest[ind, :]
    Varoos = Varoos[ind, :]
    jacs = jacs[ind, :]
    jacs2 = jacs2[ind, :]

    if getlow < 0:
        getlow = np.nanmin(Summest[:, 0].astype(float))
    if gethigh < 0:
        gethigh = np.nanmax(Summest[:, 0].astype(float)) + 1
    selected = np.array([(si >= getlow) and (si < gethigh)
                         for si in Summest[:, 0].astype(float)])
    Summest = Summest[selected, :]
    Varoos = Varoos[selected, :]
    jacs = jacs[selected, :]
    jacs2 = jacs2[selected, :]

    Mest = np.array(Summest[:, nres:nres_end]).astype(float)
    n_par = Mest.shape[1]
    print(n_par)

    err = Mest - truepar
    #print(err)
    bias = np.nanmean(err, 0)
    mse = np.nanmean(err**2, 0).astype(float)
    #print(mse)
    #rmse = np.sqrt(np.array(mse))
    rmse = 0
    #for i in range(0, 4):
    #  bias[i] = np.nanmean(err[:,i][np.abs(err[:,i])>0])
    #  mse[i] = np.nanmean(err[:,i][np.abs(err[:,i])>0]**2)
    #print(mse)
    rmse = np.sqrt(np.array(mse))
    sd = np.zeros((Mest.shape[0], 4))
    ub = np.zeros((Mest.shape[0], 4))
    lb = np.zeros((Mest.shape[0], 4))

    jacworked = np.prod(np.isnan(jacs) == False, 1)
    njacworked = np.sum(jacworked)
    inb = np.zeros((njacworked, Mest.shape[1]))
    ts = np.zeros((njacworked, 4))
    jn = 0
    jstat = np.zeros(njacworked)
    jstato = np.zeros(njacworked)
    tsn = np.zeros((njacworked, n_mom))
    Diff = np.zeros((Summest.shape[0], n_mom))
    Diff2 = np.zeros((njacworked, n_mom))
    Diffo = np.zeros((njacworked, n_mom2))
    J = np.zeros((njacworked, n_mom * 4))
    J2 = np.zeros((njacworked, n_mom2 * 4))
    JWJ = np.zeros((njacworked, 4 * 4))
    CV = np.zeros((njacworked, n_mom * 4))
    V = np.zeros((njacworked, n_mom * n_mom))
    VV = np.zeros((njacworked, n_mom * n_mom))
    VVV = np.zeros((njacworked, n_mom * n_mom))
    VO = np.zeros((njacworked, n_mom2 * n_mom2))
    for i in range(0, Summest.shape[0]):
        jac = jacs[i, :].reshape(n_mom, n_par).transpose().astype(float)
        jac = jac[0:npar_est, :]
        jac[np.abs(jac) < 1e-8] = 0
        #jac[np.abs(jac) > 50] = 0
        jac2 = jacs2[i, :].reshape(n_mom2, n_par).transpose().astype(float)
        jac2 = jac2[0:npar_est, :]
        jac2[np.abs(jac2) < 1e-8] = 0
        #jac2[np.abs(jac2) > 50] = 0
        #if moment:
        #    jac2 = JE
        #    jac = JM
        #else:
        #    jac = JE
        #    jac2 = JM
        try:
            diff = Summest[i, nimom:nimom_end] - Qdat[int(Summest[i, 0]), :]
        except:
            diff = Summest[i, nimom:nimom_end] * np.nan
        #if n_mom == 8:
        #    if np.sum(jacs2[i,:]==0)>0:
        #        jac2 = jac2 * np.nan
        #else:
        #    if np.sum(jacs[i,:]==0)>0:
        #        jac = jac * np.nan
        #for i in range(0, n_par):
        #    for j in range(0, n_mom):
        #        jac[i, j] = float(jac[i,j])
        Diff[i, :] = diff
        try:
            if wdataname == 'identity':
                w = np.identity(n_mom)
            else:
                w = Wdat[int(Summest[i, 0]), :].reshape(n_mom, n_mom)
            v = Vdat[int(Summest[i, 0]), :].reshape(n_mom, n_mom)
            diff2 = Qdat2[int(Summest[i, 0]), :] - Summest[i, namom:namom_end]
        except:
            w = np.ones((n_mom, n_mom)) * np.nan
            v = w
            diff2 = Summest[i, namom:namom_end] * np.nan
        try:
            jwj = np.linalg.inv(quad(jac, w))
            #jwjj = np.dot(jwj,jac)
            jwjj = np.linalg.solve(quad(jac, w), jac)
        except:
            jwj = np.ones((4, 4)) * np.nan
            jwjj = np.ones((4, n_mom)) * np.nan
        brd = quad(w, v)
        jbrd = quad(jac, brd)
        avar = quad(jwj, jbrd) / n
        sd[i, :] = np.sqrt(np.diag(avar) * (1 + 1 / 10))
        ub[i, :] = Mest[i, 0:npar_est] + 1.96 * sd[i, :]
        lb[i, :] = Mest[i, 0:npar_est] - 1.96 * sd[i, :]
        if jacworked[i] == 1:
            for j in range(0, 4):
                if (ub[i, j] > truepar[j]) and (lb[i, j] < truepar[j]):
                    inb[jn, j] = 1.0
            #bread = np.eye(n_mom) - np.dot(np.dot(jac.transpose(), jwjj), w)
            #cv = np.dot(np.dot(jac.transpose(), jwjj), np.dot(w, v))
            cv = np.dot(np.dot(jac.transpose(), jwjj), np.dot(w, v))
            vv = (v - cv - cv.transpose() +
                  quad(np.dot(jac.transpose(), jwjj), quad(w, v)))
            try:
                wnew = np.linalg.pinv(vv) * n * 10 / 11
                #wnew = np.linalg.pinv(quad(bread, v) * (1 + 1/10)) * n
                #wnew  = np.linalg.pinv((v - cv*(1-1/10) - cv.transpose()*(1-1/10) + quad(np.dot(jac.transpose(), jwjj), quad(w, v))*(1+1/10))) * n
            except:
                wnew = np.ones((nimom_end - nimom, nimom_end - nimom)) * np.nan
            try:
                wnew2 = np.linalg.pinv(Varoos[i, :].reshape(
                    namom_end - namom, namom_end - namom)) * n * 10 / 11
            except:
                wnew2 = np.ones(
                    (namom_end - namom, namom_end - namom)) * np.nan
            #print(quad(diff,wnew))
            jstat[jn] = quad(diff, wnew)
            jstato[jn] = quad(diff2, wnew2)
            #if n_mom == 8:
            #    if np.sum(jacs2[i,:]==0)>0:
            #        jstato[jn] = np.nan
            #else:
            #    if np.sum(jacs[i,:]==0)>0:
            #        jstat[jn] = np.nan
            Diff2[jn, :] = diff
            vv = v - cv * (1 - 1 / 10) - cv.transpose() * (1 - 1 / 10) + quad(
                np.dot(jac.transpose(), jwjj), quad(w, v)) * (1 + 1 / 10)
            VV[jn, :] = (v - cv * (1 - 1 / 10) - cv.transpose() *
                         (1 - 1 / 10) +
                         quad(np.dot(jac.transpose(), jwjj), quad(w, v)) *
                         (1 + 1 / 10)).reshape(1, n_mom * n_mom)
            V[jn, :] = wnew.reshape(1, n_mom * n_mom)
            VVV[jn, :] = v.reshape(1, n_mom * n_mom)
            VO[jn, :] = Varoos[i, :]
            Diffo[jn, :] = diff2
            JWJ[jn, :] = jwj.reshape(1, 4 * 4)
            J2[jn, :] = jac2.reshape(n_mom2 * npar_est)
            J[jn, :] = jac.reshape(n_mom * npar_est)
            CV[jn, :] = np.dot(jwj, np.dot(jac,
                                           np.dot(w,
                                                  v))).reshape(1, 4 * n_mom)
            try:
                ts[jn, :] = err[i, 0:npar_est] / sd[i, :]
            except:
                ts[jn, :] = np.nan
            tsn[jn, :] = diff / np.sqrt(np.diag(vv) / n)
            jn += 1

    jstat_true = np.abs(Summest2[isfloat(Summest2[:, 2]), 2].astype(float) *
                        n * (10 / 11))
    prt = np.nanmean(
        np.abs(ts[np.prod(np.isnan(ts) == False, 1) == 1, :]) > 1.96, 0)
    dist = stats.chi2(n_mom - 4)
    dist2 = stats.chi2(n_mom)
    dist3 = stats.chi2(n_mom2)
    dnorm = stats.norm()
    pjstat = dist.cdf(jstat)
    pjstat2 = dist2.cdf(jstat_true)
    pjstato = dist3.cdf(jstato)
    if wdataname == 'identity':
        Wdat = np.identity(n_mom)

    results = {
        'mest':
        np.nanmean(Mest, 0),
        'mestf':
        Mest,
        'bias':
        bias,
        'bias_pct':
        bias / truepar,
        'mse':
        mse,
        'rmse':
        rmse,
        'rmse_pct':
        rmse / truepar,
        'sd_pct':
        np.mean(sd[np.prod(np.isnan(tsn) == False, 1) == 1, :], 0) /
        truepar[0:npar_est],
        'sd':
        sd,
        'inb':
        inb,
        'prt':
        prt,
        'summ':
        Summest,
        'jstat':
        jstat,
        'jstat_true':
        jstat_true,
        'pjstat':
        pjstat,
        'pjstat2':
        pjstat2,
        'err':
        err,
        'ts':
        ts,
        'tsn':
        tsn,
        'jstato':
        jstato,
        'pjstato':
        pjstato,
        'w':
        Wdat,
        'diff':
        Diff,
        'diff2':
        Diff2,
        'v':
        V,
        'vv':
        VV,
        'vvv':
        VVV,
        'cv':
        CV,
        'jwj':
        JWJ,
        'ju1':
        J,
        'ju2':
        J2,
        'vfile':
        Vdat,
        'vo':
        VO,
        'diffo':
        Diffo,
        'dat':
        Qdat
    }

    plt.figure(figsize=(20, 20))
    try:
        plt.plot(dist.cdf(np.nanpercentile(jstat, np.arange(0, 101))),
                 np.arange(0, 101) / 100.0,
                 color='r',
                 lw=4)
        plt.plot(dist2.cdf(np.nanpercentile(jstat_true, np.arange(0, 101))),
                 np.arange(0, 101) / 100.0,
                 'b--',
                 lw=4)
    except:
        print("No in sample moments")
    plt.plot(np.arange(0, 101) / 100.0, np.arange(0, 101) / 100.0, 'k', lw=2)
    try:
        plt.plot(dist3.cdf(np.nanpercentile(jstato, np.arange(0, 101))),
                 np.arange(0, 101) / 100.0,
                 'g-.',
                 lw=4)
    except:
        print("No out of sample moments")
    plt.xlabel(r'Theoretical percentile', fontsize=40)
    plt.ylabel(r'Actual percentile', fontsize=40)
    plt.legend([
        r'Estimated parameters', r'True parameters', r'Theoretical',
        r'Out-of-sample'
    ],
               loc=2,
               frameon=False,
               fontsize=35)
    plt.savefig("../WR/chi2plot_" + filename + ".png")
    plt.close()

    try:
        plt.figure()
        for i in range(0, tsn.shape[1]):
            plt.plot(np.nanpercentile(dnorm.cdf(tsn[:, i]), np.arange(0, 101)),
                     np.arange(0, 101) / 100.0)
        plt.plot(np.arange(0, 101) / 100.0, np.arange(0, 101) / 100.0, 'k--')
        plt.xlabel('Theoretical percentile')
        plt.ylabel('Actual percentile')
        plt.savefig("../WR/tplot_" + filename + ".png")
        plt.close()
    except:
        print("no plot")

    try:
        plt.figure()
        for i in range(0, ts.shape[1]):
            plt.plot(np.nanpercentile(dnorm.cdf(ts[:, i]), np.arange(0, 101)),
                     np.arange(0, 101) / 100.0)
        plt.plot(np.arange(0, 101) / 100.0, np.arange(0, 101) / 100.0, 'k--')
        plt.xlabel('Theoretical percentile')
        plt.ylabel('Actual percentile')
        plt.savefig("../WR/tplot_par_" + filename + ".png")
        plt.close()
    except:
        print("no plot")
    return (results)
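
# quad() and isfloat() above come from elsewhere in this project and are not
# shown. The sketches below are assumptions inferred from their usage, not the
# original definitions: quad(a, b) as the quadratic form a @ b @ a.T, and
# isfloat() as a boolean mask of entries that parse as floats.
import numpy as np


def quad(a, b):
    # (k, m) @ (m, m) @ (m, k) -> (k, k) for a matrix `a`; for a 1-d vector
    # this collapses to the scalar a @ b @ a, matching both call sites above.
    return a @ b @ a.T


def isfloat(values):
    out = np.zeros(len(values), dtype=bool)
    for i, v in enumerate(values):
        try:
            float(v)
            out[i] = True
        except (TypeError, ValueError):
            pass
    return out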
Example #49
0
def _mvn_to_scipy(loc, cov, prec, tril):
    jax_dist = dist.MultivariateNormal(loc, cov, prec, tril)
    mean = jax_dist.mean
    cov = jax_dist.covariance_matrix
    return osp.multivariate_normal(mean=mean, cov=cov)


_DIST_MAP = {
    dist.BernoulliProbs: lambda probs: osp.bernoulli(p=probs),
    dist.BernoulliLogits: lambda logits: osp.bernoulli(p=_to_probs_bernoulli(logits)),
    dist.Beta: lambda con1, con0: osp.beta(con1, con0),
    dist.BinomialProbs: lambda probs, total_count: osp.binom(n=total_count, p=probs),
    dist.BinomialLogits: lambda logits, total_count: osp.binom(n=total_count, p=_to_probs_bernoulli(logits)),
    dist.Cauchy: lambda loc, scale: osp.cauchy(loc=loc, scale=scale),
    dist.Chi2: lambda df: osp.chi2(df),
    dist.Dirichlet: lambda conc: osp.dirichlet(conc),
    dist.Exponential: lambda rate: osp.expon(scale=np.reciprocal(rate)),
    dist.Gamma: lambda conc, rate: osp.gamma(conc, scale=1./rate),
    dist.HalfCauchy: lambda scale: osp.halfcauchy(scale=scale),
    dist.HalfNormal: lambda scale: osp.halfnorm(scale=scale),
    dist.LogNormal: lambda loc, scale: osp.lognorm(s=scale, scale=np.exp(loc)),
    dist.MultinomialProbs: lambda probs, total_count: osp.multinomial(n=total_count, p=probs),
    dist.MultinomialLogits: lambda logits, total_count: osp.multinomial(n=total_count,
                                                                        p=_to_probs_multinom(logits)),
    dist.MultivariateNormal: _mvn_to_scipy,
    dist.Normal: lambda loc, scale: osp.norm(loc=loc, scale=scale),
    dist.Pareto: lambda alpha, scale: osp.pareto(alpha, scale=scale),
    dist.Poisson: lambda rate: osp.poisson(rate),
    dist.StudentT: lambda df, loc, scale: osp.t(df=df, loc=loc, scale=scale),
    dist.Uniform: lambda a, b: osp.uniform(a, b - a),
Example #50
0
# In[19]:

elements = np.array([1, 5, 12])
probabilities = [0.05, 0.7, 0.25]
np.random.choice(elements, 10, p=probabilities)

# # Other distributions

# There are many other standard families of distributions, most of which can also be generated in Python.
# For example, the chi-squared distribution $\chi^2_k$, which has a natural parameter $k$ called the number of degrees of freedom:

# In[20]:

x = np.linspace(0, 30, 100)
for k in [1, 2, 3, 4, 6, 9]:
    rv = sts.chi2(k)
    cdf = rv.cdf(x)
    plt.plot(x, cdf, label="$k=%s$" % k)
plt.legend()
plt.title(r"CDF ($\chi^2_k$)")

# In[21]:

x = np.linspace(0, 30, 100)
for k in [1, 2, 3, 4, 6, 9]:
    rv = sts.chi2(k)
    pdf = rv.pdf(x)
    plt.plot(x, pdf, label="$k=%s$" % k)
plt.legend()
plt.title(r"PDF ($\chi^2_k$)")
Example #51
0
def ITGP(X, Y, alpha1=0.50, alpha2=0.975, nsh=2, ncc=2, nrw=1,
         maxiter=None, return_predict=True, callback=None, callback_args=(),
         warm_start=False, optimize_kwargs={}, **gp_kwargs):
    """
    Robust Gaussian Process Regression Based on Iterative Trimming.

    Parameters
    ----------
    X: array shape (n, d)
    Y: array shape (n, 1)
        Input data with shape (# of data, # of dims).
    alpha1, alpha2: float in (0, 1)
        Trimming and reweighting parameters respectively.
    nsh, ncc, nrw: int (>=0)
        Number of shrinking, concentrating, and reweighting iterations respectively.
    return_predict: bool
        If True, then the predicted mean, variance, and score of input data will be returned.
    callback: callable
        Function for monitoring the iteration process. It takes
        the iteration number i and the locals() dict as input
        e.g.
            callback=lambda i, locals: print(i, locals['gp'].num_data, locals['gp'].param_array)
        or
            callback=lambda i, locals: locals['gp'].plot()
    callback_args:
        Extra parameters for callback.
    warm_start: bool, int
        From which step it uses the warm start for optimizing hyper-parameters.
            0: (default) disable warm start, always use a fresh initial guess (provided by input gp object).
          >=1: start optimization with hyper-parameters trained from last iteration for steps >= warm_start,
        A warm start might help converge faster with the risk of being trapped at a local solution.
    optimize_kwargs:
        GPy.core.GP.optimize parameters.
    **gp_kwargs:
        GPy.core.GP parameters, including likelihood and kernel.
        Gaussian and RBF are used as defaults.

    Returns
    -------
    ITGPResult: named tuple object
        gp:
            GPy.core.GP object.
        consistency:
            Consistency factor.
        ix_sub:
            Boolean index for trimming sample.
        niter:
            Total iterations performed, <= 1 + nsh + ncc + nrw.
        Y_avg, Y_var:
            Expectation and variance of input data points. None if return_predict=False.
        score:
            Scaled residuals. None if return_predict=False.
    """
    # check parameters
    if X.ndim == 1:
        X = np.atleast_2d(X).T
    if Y.ndim == 1:
        Y = np.atleast_2d(Y).T
    if len(X) != len(Y):
        raise ValueError("X should have the same length as Y")

    n, p = Y.shape
    if p != 1:
        raise ValueError("Y is expected in shape (n, 1).")
    if n * alpha1 - 0.5 <= 2:
        raise ValueError("The dataset is unreasonably small!")

    if nsh < 0 or ncc < 0 or nrw < 0:
        raise ValueError("nsh, ncc and nrw are expected to be nonnegative numbers.")

    gp_kwargs.setdefault('likelihood', GPy.likelihoods.Gaussian(variance=1.0))
    gp_kwargs.setdefault('kernel', GPy.kern.RBF(X.shape[1]))
    gp_kwargs.setdefault('name', 'ITGP regression')

    # use copies so that input likelihood and kernel will not be changed
    likelihood_init = gp_kwargs['likelihood'].copy()
    kernel_init = gp_kwargs['kernel'].copy()

    # temp vars declaration
    d_sq = None
    ix_old = None
    niter = 0

    # shrinking and concentrating
    for i in range(1 + nsh + ncc):
        if i == 0:
            # starting with the full sample
            ix_sub = slice(None)
            consistency = 1.0
        else:
            # reducing alpha from 1 to alpha1 gradually
            if i <= nsh:
                alpha = alpha1 + (1 - alpha1) * (1 - i / (nsh + 1))
            else:
                alpha = alpha1
            chi_sq = chi2(p).ppf(alpha)
            h = int(min(np.ceil(n * alpha - 0.5), n - 1))  # alpha <= (h+0.5)/n

            # XXX: might be buggy when there are identical data points
            # better to use argpartition! but may break ix_sub == ix_old.
            ix_sub = (d_sq <= np.partition(d_sq, h)[h])  # alpha-quantile
            consistency = alpha / chi2(p + 2).cdf(chi_sq)

        # check convergence
        if (i > nsh + 1) and (ix_sub == ix_old).all():
            break  # converged
        ix_old = ix_sub

        # warm start?
        if 0 == warm_start or niter < warm_start:
            gp_kwargs['likelihood'] = likelihood_init.copy()
            gp_kwargs['kernel'] = kernel_init.copy()

        # train GP
        gp = GPy.core.GP(X[ix_sub], Y[ix_sub], **gp_kwargs)
        gp.optimize(**optimize_kwargs)

        # make prediction
        Y_avg, Y_var = gp.predict(X, include_likelihood=True)
        d_sq = ((Y - Y_avg)**2 / Y_var).ravel()

        if callback is not None:
            callback(niter, locals(), *callback_args)
        niter += 1

    # reweighting
    for i in range(nrw):
        alpha = alpha2
        chi_sq = chi2(p).ppf(alpha)

        # XXX: might be buggy when there are identical data points
        ix_sub = (d_sq <= chi_sq * consistency)
        consistency = alpha / chi2(p + 2).cdf(chi_sq)

        # check convergence
        if (ix_sub == ix_old).all():
            break  # converged
        ix_old = ix_sub

        # warm start?
        if 0 == warm_start or niter < warm_start:
            gp_kwargs['likelihood'] = likelihood_init.copy()
            gp_kwargs['kernel'] = kernel_init.copy()

        # train GP
        gp = GPy.core.GP(X[ix_sub], Y[ix_sub], **gp_kwargs)
        gp.optimize(**optimize_kwargs)

        # make prediction
        if i < nrw - 1 or return_predict:
            Y_avg, Y_var = gp.predict(X, include_likelihood=True)
            d_sq = ((Y - Y_avg)**2 / Y_var).ravel()
        else:
            pass  # skip final training unless prediction is wanted

        if callback is not None:
            callback(niter, locals(), *callback_args)
        niter += 1

    if return_predict:
        # outlier detection
        score = (d_sq / consistency)**0.5
        return ITGPResult(gp, consistency, ix_sub, niter, Y_avg, Y_var, score)
    else:
        return ITGPResult(gp, consistency, ix_sub, niter, None, None, None)
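
# A minimal usage sketch on synthetic data (an illustration, not part of the
# original module; assumes GPy is installed and ITGP is importable from here):
import numpy as np

rng = np.random.default_rng(0)
X = np.linspace(0, 10, 200)[:, None]
Y = np.sin(X) + 0.1 * rng.standard_normal(X.shape)
Y[rng.choice(len(Y), 10, replace=False)] += 5.0  # contaminate 5% of points

res = ITGP(X, Y, alpha1=0.5, alpha2=0.975, nsh=2, ncc=2, nrw=1)
outliers = res.score > 2.5  # hypothetical cutoff on the scaled residuals
print(res.niter, res.consistency, outliers.sum())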
Example #52
0
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller

def normalnoisesim(nobs=500, loc=0.0):
    return (loc+np.random.randn(nobs))


def lb(x):
    s,p = acorr_ljungbox(x, lags=4)
    return np.r_[s, p]


mc1 = StatTestMC(normalnoisesim, lb)
mc1.run(5000, statindices=lrange(4))

print(mc1.summary_quantiles([1,2,3], stats.chi2([2,3,4]).ppf,
                            varnames=['lag 1', 'lag 2', 'lag 3'],
                            title='acorr_ljungbox'))
print('\n\n')

frac = [0.01, 0.025, 0.05, 0.1, 0.975]
crit = stats.chi2([2,3,4]).ppf(np.atleast_2d(frac).T)
print(mc1.summary_cdf([1,2,3], frac, crit,
                      varnames=['lag 1', 'lag 2', 'lag 3'],
                      title='acorr_ljungbox'))
print(mc1.cdf(crit, [1,2,3])[1])

#----------------------

def randwalksim(nobs=500, drift=0.0):
    return (drift+np.random.randn(nobs)).cumsum()
Example #53
0
    def test_Gamma_to_ChiSquare(self):
        X = RV(Gamma(shape=10 / 2, scale=2))
        sims = X.sim(Nsim)
        cdf = stats.chi2(df=10).cdf
        pval = stats.kstest(sims, cdf).pvalue
        self.assertTrue(pval > .01)
Example #54
0
File: lmm.py Project: moqri/era_old
    def lmm(self, data, phe, covars, cpgnames, logdelta, reml=True):
        """
        returns output sorted by pvalues:
        sorted_cpgnames, sorted_cpg_indices, p_vals, beta_est, sigma_e_est, sigma_g_est, statistics
        where beta_est is a 2d array in which beta_est[i] holds the coefficients of site i:
            beta_est[i][0] is the coefficient of the tested site itself
            beta_est[i][1:] are the coefficients of the covariates,
            with the intercept last (see the coefficient-order note in the loop below)
        """
        number_of_samples = phe.shape[0]

        #Prepare required matrices
        Uy = np.dot(self.U.T, phe).flatten()
        UX = self.U.T.dot(covars)
        Sd = self.s + np.exp(logdelta)
        UyS = Uy / Sd
        yKy = UyS.T.dot(Uy)
        logdetK = np.log(Sd).sum()

        num_of_non_zero_eigenvalues = len(Sd)
        num_of_zero_eigenvalues = number_of_samples - num_of_non_zero_eigenvalues
        logging.debug("Found %d zero eigenvalues." % num_of_zero_eigenvalues)
        #Compute null LL
        XX = covars.T.dot(covars)
        [Sxx, Uxx] = la.eigh(XX)
        logdetXX = np.log(Sxx).sum()
        null_ll, beta_0, null_F = lleval(Uy,
                                         UX,
                                         Sd,
                                         yKy,
                                         logdetK,
                                         logdetXX,
                                         reml=reml)
        logging.debug('null LL: %s.' % null_ll)

        #Add an extra column to UX, that will hold UX for the tested site
        UX = np.concatenate((np.zeros((UX.shape[0], 1)), UX), axis=1)

        UX_all = self.U.T.dot(data)

        #Compute logdetXX - we assume it is the same for all sites because they are standardized
        covars = np.concatenate((np.zeros((number_of_samples, 1)), covars),
                                axis=1)
        covars[:, 0] = data[:, 0]
        XX = covars.T.dot(covars)
        [Sxx, Uxx] = la.eigh(XX)
        logdetXX = np.log(Sxx).sum()

        #perform GWAS
        results = []
        for site_i, site_name in enumerate(cpgnames):
            UX[:, 0] = UX_all[:, site_i]

            ll, beta, F = lleval(
                Uy, UX, Sd, yKy, logdetK, logdetXX, reml=reml
            )  # Note that the order of coefficients in beta is: site under test, covariates, intercept
            # Calculate sigma_g, sigma_e
            sigma_g = np.sum([((Uy[i] - np.dot(UX[i, :], beta))**2) / Sd[i]
                              for i in range(num_of_non_zero_eigenvalues)])
            sigma_g += np.sum([
                ((Uy[i] - np.dot(UX[i, :], beta))**2) / np.exp(logdelta)
                for i in range(num_of_zero_eigenvalues)
            ])
            if reml:
                sigma_g = (sigma_g / (number_of_samples - UX.shape[1]))**0.5
            else:
                sigma_g = (sigma_g / number_of_samples)**0.5

            sigma_e = (np.exp(logdelta) * (sigma_g**2))**0.5

            results.append((site_i, site_name, ll, F, beta, sigma_g, sigma_e))

        #sort and print results

        if reml:
            results.sort(key=lambda t: t[3], reverse=True)
            fDist = stats.f(1, number_of_samples - 1)
            p_vals = fDist.sf([t[3] for t in results])
        else:
            results.sort(key=lambda t: t[2], reverse=True)
            chi2 = stats.chi2(1)
            p_vals = chi2.sf(2 * (np.array([t[2] for t in results]) - null_ll))

        sorted_cpg_indices = [
            res[0] for res in results
        ]  # sorted_cpg_indices[i] is the index of sorted_cpgnames[i] in cpgnames. i.e cpgnames[sorted_cpg_indices[i]] == sorted_cpgnames[i]
        sorted_cpgnames = [res[1] for res in results]

        beta_est = [res[4] for res in results]
        sigma_g_est = [res[5] for res in results]
        sigma_e_est = [res[6] for res in results]

        statistics = []
        if reml:
            statistics = [res[3] for res in results]
        else:
            statistics = [res[2] for res in results]

        return sorted_cpgnames, sorted_cpg_indices, p_vals, beta_est, sigma_e_est, sigma_g_est, statistics
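
# Hypothetical usage sketch, not from the original file: `model` stands for an
# instance of the class that defines lmm() above (with model.U and model.s
# already computed), and data, phe, covars, cpgnames are the caller's arrays.
(sorted_cpgnames, sorted_cpg_indices, p_vals, beta_est,
 sigma_e_est, sigma_g_est, statistics) = model.lmm(
     data, phe, covars, cpgnames, logdelta=0.0, reml=True)

# Output is sorted by p-value, so the strongest associations come first;
# beta_est[i][0] is the coefficient of the tested site itself.
for name, p, beta in zip(sorted_cpgnames[:10], p_vals[:10], beta_est[:10]):
    print(name, p, beta[0])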
Example #55
0
    def test_sum_Normal_to_ChiSquare(self):
        X, Y, Z, A, B = RV(Normal(mean=0, var=1)**5)
        sims = ((X**2) + (Y**2) + (Z**2) + (A**2) + (B**2)).sim(Nsim)
        cdf = stats.chi2(df=5).cdf
        pval = stats.kstest(sims, cdf).pvalue
        self.assertTrue(pval > .01)
Example #56
0
import rvlib as rl
import scipy.stats as st
import numpy as np

# Get random points to evaluate functions
np.random.seed(1234)
x = np.random.rand(10)

# Create normal distrtibution
N_rl = rl.Normal(0, 1)
N_st = st.norm(0, 1)

# Check normal cdfs/pdfs against each other
N_rl_cdf = N_rl.cdf(x)
N_st_cdf = N_st.cdf(x)
np.allclose(N_rl_cdf, N_st_cdf)

# Create chi2 distributions
chi2_rl = rl.Chisq(5)
chi2_st = st.chi2(5)

# Check chi2 cdfs/pdfs against each other
chi2_rl_cdf = chi2_rl.cdf(x)
chi2_st_cdf = chi2_st.cdf(x)
np.allclose(chi2_rl_cdf, chi2_st_cdf)
Example #57
0
mean_revocation_fraction_of_discharges = {
    crime: mean(revocations[crime]) / mean(discharges[crime])
    for crime in crimes
}

mean_completion_duration = {
    crime: (1 - mean_revocation_fraction_of_discharges[crime]) *
    mean(total_population[crime]) /
    (mean(discharges[crime]) - mean(revocations[crime]))
    for crime in crimes
}

transitions_data = pd.DataFrame()
for crime in crimes:
    # populate transition data
    completion_pdf = chi2(mean_completion_duration[crime]).pdf
    probation_transition_table = pd.DataFrame({
        "compartment": ["probation"] * 100,
        "compartment_duration": [i + 1 for i in range(50)] * 2,
        "outflow_to": ["release"] * 50 + ["prison"] * 50,
        "total_population": [completion_pdf(i + 1) for i in range(50)] + [
            completion_pdf(i + 1) *
            mean_revocation_fraction_of_discharges[crime] for i in range(50)
        ],
        "crime_type": [crime] * 100,
    })
    secondary_transition_table = pd.DataFrame({
        "compartment": ["release", "prison"],
        "compartment_duration": [1, 1],
        "outflow_to": ["release", "prison"],
        "total_population": [1, 1],
Example #58
0
def chatterjeeMachlerHadi(X, y, **kwargs):
    # basic info
    options = parseKeywords(kwargs)

    # for the distances, will use absX - do this before adding intercept term
    # a column of all ones will cause problems with non full rank covariance matrices
    absX = np.absolute(X)

    # now calculate p and n
    n = absX.shape[0]
    p = absX.shape[1]

    # we treat the X matrix as a multivariate matrix with n observations and p variables
    # first need to find a basic subset free of outliers
    correctionFactor = 1 + (1.0 * (p + 1) / (n - p)) + (2.0 / (n - 1 - 3 * p))
    chi = stats.chi2(p, 0)
    alpha = 0.05
    chi2bound = correctionFactor * chi.ppf(1 - alpha / n)  # upper chi-squared quantile at level alpha / n
    # calculate h, this is the size of the first basic subset
    # note that this is the value h, the index of the hth element is h-1
    h = int(1.0 * (n + p + 1) / 2)  # here, only want the integer part of this
    # need to get the coordinatewise medians - this is the median of the columns
    medians = np.median(absX, axis=0)
    # now compute the matrix to help calculate the distance
    A = np.zeros(shape=(p, p))
    for i in range(0, n):
        tmp = absX[i, :] - medians
        A += np.outer(tmp, tmp)
    A = 1.0 / (n - 1) * A

    # now calculate initial distances
    dInit = calculateDistCMH(n, absX, medians, A)

    # now get the h smallest values of d
    sortOrder = np.argsort(dInit)
    indices = sortOrder[0:h]
    means = np.average(absX[indices, :], axis=0)
    covariance = np.cov(
        absX[indices],
        rowvar=False)  # observations in rows, columns are variables
    dH = calculateDistCMH(n, absX, means, covariance)

    # rearrange into n observations into order and partition into two initial subsets
    # one subset p+1, the n-p-1
    sortOrder = np.argsort(dH)
    indicesBasic = sortOrder[:p + 1]
    # there is a rank issue here, but ignore for now - natural observations will presumably be full rank
    means = np.average(absX[indicesBasic, :], axis=0)
    covariance = np.cov(absX[indicesBasic], rowvar=False)
    dist = calculateDistCMH(n, absX, means, covariance)

    # create the basic subset
    r = p + 2
    increment = (h - r) // 100
    if increment < 1:
        increment = 1  # here, limiting to 100 iterations of this
    while r <= h:
        sortOrder = np.argsort(dist)
        indices = sortOrder[:r]  # the r observations with the smallest distances
        means = np.average(absX[indices], axis=0)
        covariance = np.cov(absX[indices], rowvar=False)
        dist = calculateDistCMH(n, absX, means, covariance)
        if h - r > 0 and h - r < increment:
            r = h
        else:
            r += increment

    # now the second part = add more points and exclude outliers to basic set
    # all distances above r+1 = outliers
    #r = p + 1
    #increment = (n - 1 - r)/100
    while r < n:
        sortOrder = np.argsort(dist)
        dist2 = np.power(dist, 2)
        if dist2[sortOrder[r]] > chi2bound:
            break  # then leave, everything else is an outlier - it would be good if this could be saved somehow
        # otherwise, continue adding points
        sortOrder = np.argsort(dist)
        indices = sortOrder[:r]
        means = np.average(absX[indices], axis=0)
        covariance = np.cov(absX[indices], rowvar=False)
        dist = calculateDistCMH(n, absX, means, covariance)
        if n - 1 - r > 0 and n - 1 - r < increment:
            r = n - 1
        else:
            r += increment

    # now with the Hadi distances calculated, can proceed to do the robust regression
    # normalise and manipulate Hadi distances
    dist = dist / np.max(dist)
    # for the median, use the basic subset
    # indicesBasic = sortOrder[:r]
    # distMedian = np.median(dist[indicesBasic]) # I am using on indicesBasic
    distMedian = np.median(
        dist)  # the paper suggests using the median of the complete sample
    tmp = np.maximum(dist, np.ones(shape=(n)) * distMedian)
    dist = np.reciprocal(tmp)
    dist2 = np.power(dist, 2)
    dist = dist2 / np.sum(dist2)

    # calculate first set of weights - this is simply dist
    weights = dist

    # now add the additional constant intercept column if required
    if options["intercept"] == True:
        # add column of ones for constant term
        X = np.hstack((np.ones(shape=(X.shape[0], 1), dtype="complex"), X))

    n = X.shape[0]
    p = X.shape[1]

    # iteratively weighted least squares
    iteration = 0
    while iteration < options["maxiter"]:
        # do the weighted least-squares
        Anew, ynew = weightLS(X, y, weights)
        paramsNew, squareResidNew, rankNew, sNew = linalg.lstsq(Anew, ynew)
        residsNew = y - np.dot(X, paramsNew)
        # check residsNew to make sure not all zeros (i.e. can happen in an underdetermined or exactly determined system)
        if np.sum(np.absolute(residsNew)) < eps():
            # then return everything here
            return paramsNew, residsNew, weights

        residsAbs = np.absolute(residsNew)
        residsSquare = np.power(residsAbs, 2)
        residsNew = residsSquare / np.sum(residsSquare)
        residsMedian = np.median(residsAbs)

        # calculate the new weights
        tmpDenom = np.maximum(residsNew,
                              np.ones(shape=(n), dtype="float") * residsMedian)
        tmp = (1 - dist) / tmpDenom
        weightsNew = np.power(tmp, 2) / np.sum(np.power(tmp, 2))

        # increment iteration
        iteration = iteration + 1
        weights = weightsNew
        params = paramsNew

        if iteration > 1:
            # check to see whether the change is smaller than the tolerance
            changeResids = linalg.norm(residsNew -
                                       resids) / linalg.norm(residsNew)
            if changeResids < eps():
                # update resids
                resids = residsNew
                break
        # update resids
        resids = residsNew

    # at the end, return the components
    return params, resids, weights
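
# calculateDistCMH(), parseKeywords(), weightLS() and eps() are defined
# elsewhere in the project. A rough sketch of the distance helper, inferred
# from its usage above (the Mahalanobis-style distance of each row from the
# current location/scatter estimates) -- an assumption, not the original:
import numpy as np


def calculateDistCMH(n, x, mean, covariance):
    inv_cov = np.linalg.inv(covariance)
    dist = np.empty(n)
    for i in range(n):
        tmp = x[i, :] - mean
        dist[i] = np.sqrt(tmp @ inv_cov @ tmp)
    return dist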
Example #59
0
G[:N * K + N, :N * K + N] = kron(eye(N), SigmaX)
G[N * K + N:, N * K + N:] = -beta @ beta.T

# Up to here
for i in range(N):
    temp = zeros((K, K + 1))
    values = mean(u[:, i]) - multiply(all_coef[:, i], riskPremia) # beta[:, i]
    temp[:, 1:] = diag(values)
    G[N * K + N:, i * (K + 1):(i + 1) * (K + 1)] = temp

vcv = inv(G.T) @ S @ inv(G) / T

vcvAlpha = vcv[0:N * K + N:4, 0:N * K + N:4]  # stride K + 1 (= 4 here) picks out the alpha entries
J = alpha @ inv(vcvAlpha) @ alpha.T
J = J[0, 0]
Jpval = 1 - chi2(25).cdf(J)

vcvRiskPremia = vcv[N * K + N:, N * K + N:]
annualizedRP = 12 * riskPremia
arp = list(squeeze(annualizedRP))
arpSE = list(sqrt(12 * diag(vcvRiskPremia)))
print('        Annualized Risk Premia')
print('           Market       SMB        HML')
print('--------------------------------------')
print('Premia     {0:0.4f}    {1:0.4f}     {2:0.4f}'.format(arp[0], arp[1], arp[2]))
print('Std. Err.  {0:0.4f}    {1:0.4f}     {2:0.4f}'.format(arpSE[0], arpSE[1], arpSE[2]))
print('\n\n')

print('J-test:   {:0.4f}'.format(J))
print('P-value:   {:0.4f}'.format(Jpval))
Example #60
0
    acceptance = kernels.evaluate_acceptance(values)
    logging.info('obtained %d posterior samples with acceptance %.3f',
                 args.num_samples, acceptance)
    logging.info('posterior mean: %s',
                 dict(zip(feature_names, np.mean(xs, axis=0))))
    logging.info('posterior std: %s',
                 dict(zip(feature_names, np.std(xs, axis=0))))
    if 'theta' in data:
        logging.info('true values: %s', dict(zip(feature_names,
                                                 data['theta'])))
        residuals = np.mean(xs, axis=0) - data['theta']
        logging.info('z-scores: %s',
                     dict(zip(feature_names, residuals / np.std(xs, axis=0))))
        cov_ = np.cov(xs.T)
        chi2 = residuals.dot(np.linalg.inv(cov_)).dot(residuals)
        pval = 1 - stats.chi2(len(cov_)).cdf(chi2)
        logging.info('chi2 for %d dof: %f; p-val: %f', len(cov_), chi2, pval)

    # Package the data and results and save them ---------------------------------------------------
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with atomic_write(filename, mode='wb', overwrite=True) as fp:
        pickle.dump(
            {
                'arghash': arghash,
                'args': config,
                'data': data,
                'result': result,
                'samples': {
                    'xs': xs,
                    'values': values,
                },