Example 1
from statsmodels.robust import norms
from statsmodels.robust.scale import Huber


def getLocation(df, name):
    # Fall back on the plain mean if the robust estimate fails to converge
    loc = df[name].mean()
    try:
        # Joint robust estimate of location and scale (Huber's proposal 2
        # with a Tukey biweight norm)
        loc, scale = Huber(norm=norms.TukeyBiweight(c=4.685))(df[name])
    except ValueError:
        print('error ' + name)
    return loc
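
A minimal usage sketch for the helper above (the DataFrame and the column name are invented for illustration):

import pandas as pd

# Hypothetical data: one numeric column containing a gross outlier
df = pd.DataFrame({'conc': [10.2, 11.1, 9.8, 10.5, 10.9, 9.5, 11.4, 250.0]})

# Robust location estimate; falls back on the plain mean if the
# joint location/scale iteration raises ValueError
print(getLocation(df, 'conc'))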
Example 2
    @classmethod
    def setup_class(cls):
        super(TestRlmBisquareHuber, cls).setup_class()

        # Bisquare (Tukey biweight) norm with Huber's proposal 2 scale estimate
        model = RLM(cls.data.endog, cls.data.exog, M=norms.TukeyBiweight())
        results = model.fit(scale_est=HuberScale())
        # Scaled parameter covariance matrices under the H2 and H3 formulas
        h2 = model.fit(cov="H2", scale_est=HuberScale()).bcov_scaled
        h3 = model.fit(cov="H3", scale_est=HuberScale()).bcov_scaled
        cls.res1 = results
        cls.res1.h2 = h2
        cls.res1.h3 = h3
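
Examples 2 and 3 are setup fixtures excerpted from the statsmodels RLM test suite. Below is a minimal standalone sketch of the same fit; using the stack loss dataset is an assumption here, chosen because it ships with statsmodels:

import statsmodels.api as sm
from statsmodels.robust import norms
from statsmodels.robust.scale import HuberScale

data = sm.datasets.stackloss.load()
exog = sm.add_constant(data.exog)

# Bisquare (Tukey biweight) norm with Huber's proposal 2 scale,
# mirroring the fixture above
model = sm.RLM(data.endog, exog, M=norms.TukeyBiweight())
results = model.fit(scale_est=HuberScale())
print(results.params)
print(results.bse)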
Example 3
    @classmethod
    def setup_class(cls):
        super(TestRlmBisquare, cls).setup_class()
        # Loosen the test precision for standard errors
        cls.decimal_standarderrors = DECIMAL_1

        # Bisquare (Tukey biweight) norm with the default (MAD) scale estimate
        model = RLM(cls.data.endog, cls.data.exog, M=norms.TukeyBiweight())
        results = model.fit()
        h2 = model.fit(cov="H2").bcov_scaled
        h3 = model.fit(cov="H3").bcov_scaled
        cls.res1 = results
        cls.res1.h2 = h2
        cls.res1.h3 = h3
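
As a sketch (same dataset assumption as above), the default H1 covariance can be compared against the H2 and H3 variants that the fixture stores:

import statsmodels.api as sm
from statsmodels.robust import norms

data = sm.datasets.stackloss.load()
exog = sm.add_constant(data.exog)

model = sm.RLM(data.endog, exog, M=norms.TukeyBiweight())
h1 = model.fit().bcov_scaled            # cov="H1" is the default
h2 = model.fit(cov="H2").bcov_scaled
h3 = model.fit(cov="H3").bcov_scaled
# All three are scaled covariance matrices of the coefficient estimates
print(abs(h1 - h2).max(), abs(h1 - h3).max())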
Example 4
def smafit(X0,
           Y0,
           W0=None,
           cl=0.95,
           intercept=True,
           robust=False,
           rmethod='FastMCD'):
    """Standard Major-Axis (SMA) line fitting
    
    Calculate the standard major axis (aka reduced major axis) fit to 
    data X and Y. The main advantage of this over ordinary least squares is 
    that the best fit of Y to X will be the same as the best fit of X to Y.
    
    The fit equations and confidence intervals are implemented following 
    Warton et al. (2006). Robust fits use the FastMCD covariance estimate 
    from Rousseeuw and Van Driessen (1999). While there are many alternative 
    robust covariance estimators (e.g. other papers by D.I. Warton using M-estimators), 
    the FastMCD algorithm is the default in Matlab. When the standard error or 
    uncertainty of each point is known, weighted SMA may be preferable to 
    robust SMA. The conventional choice of weights for each point i is 
    W_i = 1 / ( var(X_i) + var(Y_i) ), where var() is the variance 
    (squared standard error).
    
    References
    ----------
    Warton, D. I., Wright, I. J., Falster, D. S. and Westoby, M.: 
        Bivariate line-fitting methods for allometry, Biol. Rev., 81(02), 259, 
        doi:10.1017/S1464793106007007, 2006.
    Rousseeuw, P. J. and Van Driessen, K.: A Fast Algorithm for the Minimum 
        Covariance Determinant Estimator, Technometrics, 41(3), 1999.

    Parameters
    ----------
    X0, Y0 : array_like
        Input values. Must have the same length.
    W0 : array_like, optional
        Weights for each X-Y point, typically W_i = 1/(var(X_i)+var(Y_i)).
    cl : float (default=0.95)
        Desired confidence level for output.
    intercept : boolean (default=True)
        Specify if the fitted model should include a non-zero intercept.
        The model will be forced through the origin (0,0) if intercept=False.
    robust : boolean (default=False)
        Use statistical methods that are robust to the presence of outliers.
    rmethod : string (default='FastMCD')
        Method for calculating robust variance and covariance. Options:
        'MCD' or 'FastMCD' for Fast MCD
        'Huber' for Huber's T: reduces, but does not eliminate, the influence of outliers
        'Biweight' for Tukey's Biweight: reduces then eliminates the influence of outliers

    Returns
    -------
    Slope     : float
        Slope or Gradient of Y vs. X
    Intercept : float
        Y intercept.
    ste_slope : float
        Standard error of the slope estimate
    ste_int : float
        Standard error of the intercept estimate
    ci_grad : [float, float]
        Confidence interval for the slope at confidence level cl
    ci_int : [float, float]
        Confidence interval for the intercept at confidence level cl
    """

    import numpy as np
    import scipy.stats as stats
    from sklearn.covariance import MinCovDet
    import statsmodels.formula.api as smf
    import statsmodels.robust.norms as norms

    # Make sure arrays have the same length
    assert (len(X0) == len(Y0)), 'Arrays X and Y must have the same length'
    if W0 is not None:
        assert (
            len(W0) == len(X0)), 'Array W must have the same length as X and Y'

    # Make sure cl is within the range 0-1
    assert (cl < 1), 'cl must be less than 1'
    assert (cl > 0), 'cl must be greater than 0'

    if W0 is None:
        W0 = np.ones_like(X0)

    # Drop any NaN elements of X or Y
    # Infinite values are allowed but will make the result undefined
    idx = ~np.logical_or(np.isnan(X0), np.isnan(Y0))

    X = X0[idx]
    Y = Y0[idx]
    W = W0[idx]

    # Number of observations
    N = len(X)

    # Degrees of freedom for the model
    if (intercept):
        dfmod = 2
    else:
        dfmod = 1

    # Choose whether to use methods robust to outliers
    if (robust):

        # Choose the robust method
        if ((rmethod.lower() == 'mcd') or (rmethod.lower() == 'fastmcd')):
            # FAST MCD

            if (not intercept):
                # intercept=False could possibly be supported by using
                # mcd.support_ as weights in an explicit variance/covariance calculation
                raise NotImplementedError(
                    'FastMCD method only supports SMA with intercept')

            # Fit robust model of mean and covariance
            mcd = MinCovDet().fit(np.array([X, Y]).T)

            # Robust mean
            Xmean = mcd.location_[0]
            Ymean = mcd.location_[1]

            # Robust variance of X, Y
            Vx = mcd.covariance_[0, 0]
            Vy = mcd.covariance_[1, 1]

            # Robust covariance
            Vxy = mcd.covariance_[0, 1]

            # Number of observations used in mean and covariance estimate
            # excludes observations marked as outliers
            N = mcd.support_.sum()

        elif ((rmethod.lower() == 'biweight') or (rmethod.lower() == 'huber')):

            # Tukey's Biweight and Huber's T
            if (rmethod.lower() == 'biweight'):
                norm = norms.TukeyBiweight()
            else:
                norm = norms.HuberT()

            # Get weights for downweighting outliers
            # Fitting a robust linear model is the easiest way to get them
            # Norm options include "TukeyBiweight" (totally removes large deviates)
            # and "HuberT" (linear, not squared, weighting of large deviates)
            rweights = smf.rlm('y~x+1', {'x': X, 'y': Y}, M=norm).fit().weights

            # Sum of weights and weights squared, for convenience
            rsum = np.sum(rweights)
            rsum2 = np.sum(rweights**2)

            # Mean
            Xmean = np.sum(X * rweights) / rsum
            Ymean = np.sum(Y * rweights) / rsum

            # Force intercept through zero, if requested
            if (not intercept):
                Xmean = 0
                Ymean = 0

            # Variance & Covariance
            Vx = np.sum((X - Xmean)**2 * rweights**2) / rsum2
            Vy = np.sum((Y - Ymean)**2 * rweights**2) / rsum2
            Vxy = np.sum((X - Xmean) * (Y - Ymean) * rweights**2) / rsum2

            # Effective number of observations
            N = rsum

        else:

            raise NotImplementedError(
                "smafit.py hasn't implemented rmethod={:s}".format(rmethod))
    else:

        if (intercept):

            wsum = np.sum(W)

            # Average values
            Xmean = np.sum(X * W) / wsum
            Ymean = np.sum(Y * W) / wsum

            # Covariance matrix
            cov = np.cov(X, Y, ddof=1, aweights=W**2)

            # Variance
            Vx = cov[0, 0]
            Vy = cov[1, 1]

            # Covariance
            Vxy = cov[0, 1]

        else:

            # Force the line to pass through origin by setting means to zero
            Xmean = 0
            Ymean = 0

            wsum = np.sum(W)

            # Sum of squares in place of variance and covariance
            Vx = np.sum(X**2 * W) / wsum
            Vy = np.sum(Y**2 * W) / wsum
            Vxy = np.sum(X * Y * W) / wsum

    # Standard deviation
    Sx = np.sqrt(Vx)
    Sy = np.sqrt(Vy)

    # Correlation coefficient (equivalent to np.corrcoef()[1,0] for non-robust cases)
    R = Vxy / np.sqrt(Vx * Vy)

    #############
    # SLOPE

    Slope = np.sign(R) * Sy / Sx

    # Standard error of slope estimate
    ste_slope = np.sqrt(1 / (N - dfmod) * Sy**2 / Sx**2 * (1 - R**2))

    # Confidence interval for Slope
    B = (1 - R**2) / (N - dfmod) * stats.f.isf(1 - cl, 1, N - dfmod)
    ci_grad = Slope * (np.sqrt(B + 1) + np.sqrt(B) * np.array([-1, +1]))

    #############
    # INTERCEPT

    if (intercept):
        Intercept = Ymean - Slope * Xmean

        # Standard deviation of residuals
        # New Method: Formula from smatr R package (Warton)
        # This formula avoids large residuals of outliers when using robust=True
        Sr = np.sqrt(
            (Vy - 2 * Slope * Vxy + Slope**2 * Vx) * (N - 1) / (N - dfmod))

        # OLD METHOD
        # Standard deviation of residuals
        #resid = Y - (Intercept + Slope * X )
        # Population standard deviation of the residuals
        #Sr = np.std( resid, ddof=0 )

        # Standard error of the intercept estimate
        ste_int = np.sqrt(Sr**2 / N + Xmean**2 * ste_slope**2)

        # Confidence interval for Intercept
        tcrit = stats.t.isf((1 - cl) / 2, N - dfmod)
        ci_int = Intercept + ste_int * np.array([-tcrit, tcrit])

    else:

        # Set Intercept quantities to zero
        Intercept = 0
        ste_int = 0
        ci_int = np.array([0, 0])

    return Slope, Intercept, ste_slope, ste_int, ci_grad, ci_int
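
A quick usage sketch for smafit on synthetic data (both the data and the planted outlier are invented for illustration):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(10.0, 2.0, 200)
y = 2.0 * x + 1.0 + rng.normal(0.0, 1.0, 200)
y[0] = 100.0   # plant one gross outlier

# Ordinary SMA fit
slope, intcpt, ste_slope, ste_int, ci_grad, ci_int = smafit(x, y)

# Robust SMA fit, downweighting the outlier with Tukey's biweight
rslope, rintcpt, *_ = smafit(x, y, robust=True, rmethod='Biweight')

print(slope, rslope)       # the robust slope should sit closer to 2
print(ci_grad, ci_int)     # confidence intervals at the default cl=0.95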